combine 2 df
dim(test)
## [1] 1459 80
str(train)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1460 obs. of 81 variables:
## $ Id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : num 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr "RL" "RL" "RL" "RL" ...
## $ LotFrontage : num 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : num 8450 9600 11250 9550 14260 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ LotShape : chr "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : num 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : num 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : num 2003 1976 2001 1915 2000 ...
## $ YearRemodAdd : num 2003 1976 2002 1970 2000 ...
## $ RoofStyle : chr "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : num 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : chr "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : num 706 978 486 216 655 ...
## $ BsmtFinType2 : chr "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : num 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : num 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : num 856 1262 920 756 1145 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ 1stFlrSF : num 856 1262 920 961 1145 ...
## $ 2ndFlrSF : num 854 0 866 756 1053 ...
## $ LowQualFinSF : num 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : num 1710 1262 1786 1717 2198 ...
## $ BsmtFullBath : num 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : num 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : num 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : num 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : num 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : num 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : num 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : num 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : chr NA "TA" "TA" "Gd" ...
## $ GarageType : chr "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : num 2003 1976 2001 1998 2000 ...
## $ GarageFinish : chr "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : num 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : num 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : num 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : num 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: num 0 0 0 272 0 0 0 228 205 0 ...
## $ 3SsnPorch : num 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr NA NA NA NA ...
## $ Fence : chr NA NA NA NA ...
## $ MiscFeature : chr NA NA NA NA ...
## $ MiscVal : num 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : num 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : num 2008 2007 2008 2006 2008 ...
## $ SaleType : chr "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : num 208500 181500 223500 140000 250000 ...
## - attr(*, "spec")=
## .. cols(
## .. Id = col_double(),
## .. MSSubClass = col_double(),
## .. MSZoning = col_character(),
## .. LotFrontage = col_double(),
## .. LotArea = col_double(),
## .. Street = col_character(),
## .. Alley = col_character(),
## .. LotShape = col_character(),
## .. LandContour = col_character(),
## .. Utilities = col_character(),
## .. LotConfig = col_character(),
## .. LandSlope = col_character(),
## .. Neighborhood = col_character(),
## .. Condition1 = col_character(),
## .. Condition2 = col_character(),
## .. BldgType = col_character(),
## .. HouseStyle = col_character(),
## .. OverallQual = col_double(),
## .. OverallCond = col_double(),
## .. YearBuilt = col_double(),
## .. YearRemodAdd = col_double(),
## .. RoofStyle = col_character(),
## .. RoofMatl = col_character(),
## .. Exterior1st = col_character(),
## .. Exterior2nd = col_character(),
## .. MasVnrType = col_character(),
## .. MasVnrArea = col_double(),
## .. ExterQual = col_character(),
## .. ExterCond = col_character(),
## .. Foundation = col_character(),
## .. BsmtQual = col_character(),
## .. BsmtCond = col_character(),
## .. BsmtExposure = col_character(),
## .. BsmtFinType1 = col_character(),
## .. BsmtFinSF1 = col_double(),
## .. BsmtFinType2 = col_character(),
## .. BsmtFinSF2 = col_double(),
## .. BsmtUnfSF = col_double(),
## .. TotalBsmtSF = col_double(),
## .. Heating = col_character(),
## .. HeatingQC = col_character(),
## .. CentralAir = col_character(),
## .. Electrical = col_character(),
## .. `1stFlrSF` = col_double(),
## .. `2ndFlrSF` = col_double(),
## .. LowQualFinSF = col_double(),
## .. GrLivArea = col_double(),
## .. BsmtFullBath = col_double(),
## .. BsmtHalfBath = col_double(),
## .. FullBath = col_double(),
## .. HalfBath = col_double(),
## .. BedroomAbvGr = col_double(),
## .. KitchenAbvGr = col_double(),
## .. KitchenQual = col_character(),
## .. TotRmsAbvGrd = col_double(),
## .. Functional = col_character(),
## .. Fireplaces = col_double(),
## .. FireplaceQu = col_character(),
## .. GarageType = col_character(),
## .. GarageYrBlt = col_double(),
## .. GarageFinish = col_character(),
## .. GarageCars = col_double(),
## .. GarageArea = col_double(),
## .. GarageQual = col_character(),
## .. GarageCond = col_character(),
## .. PavedDrive = col_character(),
## .. WoodDeckSF = col_double(),
## .. OpenPorchSF = col_double(),
## .. EnclosedPorch = col_double(),
## .. `3SsnPorch` = col_double(),
## .. ScreenPorch = col_double(),
## .. PoolArea = col_double(),
## .. PoolQC = col_character(),
## .. Fence = col_character(),
## .. MiscFeature = col_character(),
## .. MiscVal = col_double(),
## .. MoSold = col_double(),
## .. YrSold = col_double(),
## .. SaleType = col_character(),
## .. SaleCondition = col_character(),
## .. SalePrice = col_double()
## .. )
test_labels <- test$Id
test$Id <- NULL
train$Id <- NULL
test$SalePrice <- NA
##col name不同,無法rbind
which(names(test)!=names(train))
## [1] 43 44 69
colnames(test)[c(43,44,69)]<-colnames(train)[c(43,44,69)]
##?rbind= combine object by rows
all<- rbind(train, test)
dim(all)
## [1] 2919 80
attach(all)
check the dependent, sales price
ggplot(all, aes(SalePrice))+
geom_histogram(fill='blue', binwidth = 10000)+
scale_x_continuous(breaks= seq(0,800000,100000), labels = comma)

##found: right skewed, 大多數人都買便宜房子
summary(all$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 34900 129975 163000 180921 214000 755000 1459
check which numerical has the highest cor with dependent
#index vector of the numerical Vars
numericVar<-which(sapply(all,is.numeric))
# sapply(obj, func): 返回一個vector;
# 這個vector的長度=obj的長度,
# 然後sapply自己對obj中的每個小物件執行給定的func,
# func執行後,產生的每一個回應(新的小物件),
# 都會被丟回上面產生的vector的相應編號中
# eg. 這裏是對all裡面的每一個物件,先產生一個相應的數字(index, vector)(有name), 然後對每個all執行is.numeric,會返回一個布林;
# 把這個布林丟到which中,which只會選TRUE
class(numericVar)
## [1] "integer"
# 存著名字待會用
numericVarNames<- names(numericVar)
allNumVar<-all[,numericVar]
##!!!!! 畫相互cor矩陣
corNumVar<-cor(allNumVar, use="pairwise.complete.obs")
corSorted<- as.matrix(sort(corNumVar[,'SalePrice'],decreasing = TRUE))
# CorHigh<-names(which(corSorted>0.5))
CorHigh<-names(which(apply(corSorted,1,function(x) abs(x)>0.5)))
##?apply=
##?abs=算絕對值
corNumVarSorted<-corNumVar[CorHigh,CorHigh]
corrplot.mixed(corNumVarSorted,tl.col='black',tl.pos = 'lt')

explore the highest cor variable OverallQuality
# use box plot to check the subgroup relatio
ggplot(all[!is.na(SalePrice),], aes(factor(OverallQual),SalePrice))+
geom_boxplot(col='blue')+labs(x='overall quality')+
scale_y_continuous(breaks = seq(0,800000,100000),labels=comma)

##clear path of correlation bt saleprice and qverall quality, few outliers and no extreme values
explore the second highest cor variable, Above grade (ground) living area square feet
ggplot(all[!is.na(SalePrice),],aes(GrLivArea,SalePrice))+
geom_point(col='blue',position = 'jitter', alpha=0.25)+
scale_y_continuous(breaks=seq(0,800000,100000),labels=comma)+
geom_smooth(method=lm, col='black',aes(group=1))+
##?group=
geom_text_repel(aes(label=ifelse(GrLivArea[!is.na(SalePrice)]>4500,rownames(all),'')))

##?geom_text_repel: 直接在圖上加上點的數值
#ifelse(test, yes, no)= 給一個布林(test),如果是true,返回(yes),是false,返回(no)
##?rownames: 返回每個row的編號(PK)(其實電腦內建有一個編號,只是沒有顯示出來);For a data frame, value for rownames should be a character vector of non-duplicated and non-missing names (this is enforced), and for colnames a character vector of (preferably) unique syntactically-valid names.
###綜合這行的意思就是:把圖上每個點都寫上字,寫什麼?label,
###如果某row"SP不是na"且Gr>4500,返回yes,而yes的label是 那個row的編號
###反之,返回no,no的label是空字串
all[c(524,1299),c('SalePrice', 'GrLivArea','OverallQual')]
##發現有兩間房子很大但很便宜,可能是outliers,但先考慮是否是overal quality的問題,結果叫出來,發現quality 也很好,所以要把這兩筆放到outliers清單
##很強的線性關係,se也很小
check the missing values
##找有missing value 的col( index )
NAcol<-which(colSums(is.na(all))>0)
##which= obj中挑出true的
##此處which 底下的數字是index不是數量!!!!
##is.na驗證每一個value並返回布林,只要這個col有一個na,那計算colsum時就會+1
NaNum<-sort(colSums(sapply(all[NAcol],is.na)),decreasing = TRUE)
##寫成func:
NaNum<-function(df){
sort(colSums(sapply(all[which(colSums(is.na(df))>0)],is.na)),decreasing = TRUE)
}
cat('There are', length(NAcol), 'columns with missing values')
## There are 35 columns with missing values
##fix NA in 34 predictors( on column is SalesPrice)
all$PoolQC[is.na(all$PoolQC)]<-'None'
table(all$PoolQC)
##
## Ex Fa Gd None
## 4 2 4 2909
Qualities<- c('None'=0,'Po'=1,'Fa'=2,'TA'=3,'Gd'=4,'Ex'=5)
all$PoolQC<-as.integer(plyr::revalue(all$PoolQC,Qualities))
#?plyr::revalue(obj1,obj2), 把obj1的named level換成obj2
table(all$PoolQC)
##
## 0 2 4 5
## 2909 2 4 4
##下面這裡把all$PoolArea>0 &all$PoolQC == 0 的3rows 重新賦值
all[all$PoolArea>0 &all$PoolQC == 0, "PoolQC"]<-c(2,3,2)
table(all$PoolQC)
##
## 0 2 3 4 5
## 2906 4 1 4 4
Imputing missing data: Miscellaneous Feature
table(all$MiscFeature)
##
## Gar2 Othr Shed TenC
## 5 4 95 1
all$MiscFeature[is.na(all$MiscFeature)]='None'
all$MiscFeature<-as.factor(all$MiscFeature)
ggplot(all[!is.na(all$SalePrice),],aes(MiscFeature,SalePrice))+
geom_bar(stat = 'summary', fun.y='median',fill='blue')+
##?stat=summary: x分類中的平均y值
scale_y_continuous(breaks=seq(0,800000,50000),labels = comma)+
#######下面這串看不懂,?geom_label:
geom_label(stat='count',aes(label=..count..,y=..count..))

##seems like MiscFeatures are irrelevnt to saleprice, eventhough house with Tennis court is much more expensive, but there is only one house with TC; House with Shed are cheaper, but it probably because shed is alternative of garrage.
Imputing missing data: LotFrontage
##LotFrontage
which(is.na(all$LotFrontage))
## [1] 8 13 15 17 25 32 43 44 51 65 67 77 85 96
## [15] 101 105 112 114 117 121 127 132 134 137 148 150 153 154
## [29] 161 167 170 171 178 181 187 192 204 208 209 215 219 222
## [43] 235 238 245 250 270 288 289 294 308 309 311 320 329 331
## [57] 336 343 347 348 352 357 361 362 365 367 370 371 376 385
## [71] 393 394 405 406 413 422 427 448 453 458 459 460 466 471
## [85] 485 491 497 517 519 530 538 539 540 542 546 560 561 565
## [99] 570 581 594 611 612 613 617 624 627 642 646 661 667 669
## [113] 673 680 683 686 688 691 707 710 715 721 722 727 735 746
## [127] 747 752 758 771 784 786 790 792 795 812 817 818 823 829
## [141] 841 846 852 854 856 857 860 866 869 880 883 894 901 905
## [155] 909 912 918 926 928 929 930 940 942 945 954 962 968 976
## [169] 981 984 989 997 998 1004 1007 1018 1019 1025 1031 1033 1034 1036
## [183] 1038 1042 1046 1058 1060 1065 1078 1085 1087 1098 1109 1111 1117 1123
## [197] 1125 1139 1142 1144 1147 1149 1154 1155 1162 1165 1178 1181 1191 1194
## [211] 1207 1214 1231 1234 1245 1248 1252 1254 1261 1263 1269 1271 1272 1273
## [225] 1277 1278 1287 1288 1291 1301 1302 1310 1313 1319 1322 1343 1347 1349
## [239] 1355 1357 1358 1359 1363 1366 1369 1374 1382 1384 1397 1408 1418 1420
## [253] 1424 1425 1430 1432 1442 1444 1447 1467 1501 1502 1506 1508 1513 1520
## [267] 1536 1543 1559 1564 1566 1568 1574 1580 1585 1593 1607 1613 1628 1635
## [281] 1638 1640 1643 1644 1645 1648 1649 1660 1690 1691 1692 1696 1699 1701
## [295] 1729 1732 1733 1734 1735 1737 1738 1740 1741 1744 1747 1751 1755 1758
## [309] 1759 1762 1769 1820 1824 1834 1841 1844 1847 1848 1849 1862 1863 1864
## [323] 1873 1879 1882 1884 1886 1903 1911 1912 1923 1937 1942 1946 1948 1950
## [337] 1956 1958 1985 1986 1989 1990 1993 1997 2000 2024 2030 2031 2040 2042
## [351] 2043 2045 2050 2053 2065 2075 2111 2112 2123 2129 2132 2138 2141 2142
## [365] 2143 2147 2149 2156 2158 2164 2165 2167 2168 2171 2172 2174 2175 2176
## [379] 2179 2187 2203 2204 2205 2223 2224 2235 2242 2246 2251 2254 2255 2258
## [393] 2259 2269 2280 2301 2321 2326 2328 2358 2362 2373 2380 2388 2390 2396
## [407] 2397 2404 2422 2424 2432 2447 2460 2467 2481 2484 2485 2491 2492 2493
## [421] 2494 2499 2513 2515 2522 2531 2535 2536 2548 2568 2569 2571 2573 2584
## [435] 2587 2588 2595 2597 2598 2603 2607 2612 2615 2617 2618 2621 2626 2635
## [449] 2637 2663 2673 2674 2677 2678 2681 2684 2685 2701 2704 2705 2707 2708
## [463] 2709 2710 2715 2716 2725 2728 2738 2739 2742 2765 2808 2811 2812 2813
## [477] 2815 2816 2819 2840 2846 2848 2851 2901 2902 2909
ggplot(all[!is.na(all$LotFrontage),],aes(as.factor(Neighborhood),LotFrontage))+
geom_bar(stat='summary',fun.y='median',fill='blue')+
xlab('neighborhood')+
theme(axis.text.x= element_text(angle = 45,hjust = 1))

##theme 調整底下label的顯示方式,讓它不會疊在一起,angle表示45度斜角顯示,hjust表示下移一單位
for (i in 1:nrow(all)){
if(is.na(all$LotFrontage[i])){
all$LotFrontage[i]<-
as.integer(median(all$LotFrontage[all$Neighborhood==all$Neighborhood[i]],na.rm=TRUE))
}
}
##把lotfrontage 裡面的missing value 換成中位數
for (i in 1:nrow(all)){
if(is.na(all$LotFrontage[i])){
all$LotFrontage[i] <- as.integer(median(all$LotFrontage[all$Neighborhood==all$Neighborhood[i]], na.rm=TRUE))
}
}
sum(is.na(all$LotFrontage))
## [1] 0
Imputing missing data: LotShape
table(all$LotShape)
##
## IR1 IR2 IR3 Reg
## 968 76 16 1859
sum(is.na(LotShape))
## [1] 0
all$LotShape<- as.integer(plyr::revalue(all$LotShape,c('IR3'=0, 'IR2'=1,'IR1'=2,'Reg'=3)))
table(all$LotShape)
##
## 0 1 2 3
## 16 76 968 1859
ggplot(all[!is.na(SalePrice),], aes(as.factor(LotShape),SalePrice))+
geom_boxplot()+xlab('LotShape')+
scale_y_continuous(labels = comma)

Imputing missing data: LotConfig
table(all$LotConfig)
##
## Corner CulDSac FR2 FR3 Inside
## 511 176 85 14 2133
ggplot(all[!is.na(all$SalePrice),], aes(as.factor(LotConfig),SalePrice))+
geom_bar(stat='summary',fun.y='median',fill='blue')+
scale_y_continuous(breaks=seq(0,800000,50000),labels = comma)+
geom_label(stat='count',aes(label=..count..,y=..count..))

Imputing missing data: Garage
all$GarageYrBlt[is.na(all$GarageYrBlt)] <- all$YearBuilt[is.na(all$GarageYrBlt)]
# As NAs mean ‘No Garage’ for character variables, I now want to find out where the differences between the 157 NA GarageType and the other 3 character variables with 159 NAs come from.
#check if all 157 NAs are the same observations among the variables with 157/159 NAs
length(which(is.na(all$GarageType)&is.na(all$GarageFinish)&is.na(all$GarageCond)& is.na(all$GarageQual)))
## [1] 157
#Find the 2 additional NAs
##?kable
kable(all[!is.na(GarageType)&is.na(GarageFinish),c('GarageCars', 'GarageArea', 'GarageType', 'GarageCond', 'GarageQual', 'GarageFinish')])
| 1 |
360 |
Detchd |
NA |
NA |
NA |
| NA |
NA |
Detchd |
NA |
NA |
NA |
##check the row index
which(!is.na(GarageType)&is.na(GarageFinish))
## [1] 2127 2577
#The 157 NAs within GarageType all turn out to be NA in GarageCondition, GarageQuality, and GarageFinish as well. The differences are found in houses 2127 and 2577. As you can see, house 2127 actually does seem to have a Garage and house 2577 does not. Therefore, there should be 158 houses without a Garage. To fix house 2127, I will imputate the most common values (modes) for GarageCond, GarageQual, and GarageFinish.
# imputate 2127 the most common values (modes) for GarageCond, GarageQual, and GarageFinish.
#在table前加一個負號,這樣table本來是從少到多排序,就會被反過來,就能選第一個(是最多)
names(sort(-table(all$GarageCond)))[1]
## [1] "TA"
names(sort(-table(all$GarageQual)))[1]
## [1] "TA"
names(sort(-table(all$GarageFinish)))[1]
## [1] "Unf"
all[2127,c('GarageCond',"GarageQual","GarageFinish")]<-
c(names(sort(-table(all$GarageCond)))[1],
names(sort(-table(all$GarageQual)))[1],
names(sort(-table(all$GarageFinish)))[1])
kable(all[2127,c('GarageYrBlt', 'GarageCars', 'GarageArea', 'GarageType', 'GarageCond', 'GarageQual', 'GarageFinish')])
| 1910 |
1 |
360 |
Detchd |
TA |
TA |
Unf |
#Both have 1 NA. As you can see above, it is house 2577 for both variables. The problem probably occured as the GarageType for this house is “detached”, while all other Garage-variables seem to indicate that this house has no Garage.
which(is.na(GarageCars))
## [1] 2577
which(is.na(GarageArea))
## [1] 2577
which(is.na(GarageType))
## [1] 40 49 79 89 90 100 109 126 128 141 149 156 164 166
## [15] 199 211 242 251 288 292 308 376 387 394 432 435 442 465
## [29] 496 521 529 534 536 563 583 614 615 621 636 637 639 650
## [43] 706 711 739 751 785 827 844 922 943 955 961 969 971 977
## [57] 1010 1012 1031 1039 1097 1124 1132 1138 1144 1174 1180 1219 1220 1235
## [71] 1258 1284 1324 1326 1327 1338 1350 1408 1450 1451 1454 1514 1532 1540
## [85] 1553 1557 1559 1561 1591 1594 1595 1615 1616 1718 1722 1788 1809 1811
## [99] 1812 1820 1823 1832 1835 1837 1840 1848 1894 2011 2082 2091 2094 2097
## [113] 2100 2105 2136 2152 2154 2190 2191 2192 2193 2194 2213 2239 2247 2354
## [127] 2355 2399 2400 2423 2427 2553 2554 2558 2576 2580 2604 2610 2692 2694
## [141] 2709 2768 2772 2790 2792 2800 2860 2863 2871 2889 2892 2893 2894 2910
## [155] 2914 2915 2918
which(is.na(all[,c('GarageCars','GarageArea')]))
## [1] 2577 5496
all[c(2577,5946),c('GarageCars','GarageArea',"GarageCond","GarageFinish","GarageYrBlt","GarageQual","GarageType")]
#fixing 3 values for house 2577
all[2577,c("GarageCars","GarageArea","GarageType")]<-c(0,0,NA)
#check if NAs of the character variables are now all 158
length(which(is.na(all$GarageType) & is.na(all$GarageFinish) & is.na(all$GarageCond) & is.na(all$GarageQual)))
## [1] 158
#Now, the 4 character variables related to garage all have the same set of 158 NAs, which correspond to ‘No Garage’. I will fix all of them in the remainder of this section
#The values do not seem ordinal, so I will convert into a factor.
all$GarageType[is.na(all$GarageType)]<-'No Garage'
all$GarageType<-as.factor(all$GarageType)
table(all$GarageType)
##
## 2Types Attchd Basment BuiltIn CarPort Detchd No Garage
## 23 1723 36 186 15 778 158
all$GarageFinish[is.na(all$GarageFinish)]<-'None'
Finish<-c('None'=0,'Unf'=1,'RFn'=2,'Fin'=3)
all$GarageFinish<-as.integer(plyr::revalue(all$GarageFinish,Finish))
table(all$GarageFinish)
##
## 0 1 2 3
## 158 1231 811 719
all$GarageQual[is.na(all$GarageQual)]<-'None'
all$GarageQual<-as.integer(plyr::revalue(all$GarageQual,Qualities))
table(all$GarageQual)
##
## 0 1 2 3 4 5
## 158 5 124 2605 24 3
all$GarageCond[is.na(all$GarageCond)]<-'None'
all$GarageCond<-as.integer((plyr::revalue(all$GarageCond,Qualities)))
table(all$GarageCond)
##
## 0 1 2 3 4 5
## 158 14 74 2655 15 3
Imputing missing data: Alley
all$Alley[is.na(Alley)]<-'None'
all$Alley<-as.factor(all$Alley)
table(all$Alley)
##
## Grvl None Pave
## 120 2721 78
ggplot(all[!is.na(SalePrice),],aes(Alley,SalePrice))+
geom_bar(stat='summary',fun.y='median',fill='blue')+
scale_y_continuous(breaks=seq(0,200000,50000),labels=comma)

##pave對價格影響不顯著, grvl 價格較低(是否有其他特徵影響??)
factor(all$Fence)
## [1] <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA>
## [12] <NA> <NA> <NA> GdWo GdPrv <NA> <NA> <NA> MnPrv <NA> GdPrv
## [23] <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA> MnPrv MnPrv <NA>
## [34] <NA> <NA> <NA> <NA> <NA> <NA> <NA> GdWo <NA> MnPrv MnPrv
## [45] <NA> <NA> <NA> <NA> <NA> MnPrv <NA> MnPrv <NA> <NA> MnPrv
## [56] <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> GdPrv GdPrv <NA>
## [67] <NA> <NA> <NA> <NA> <NA> <NA> <NA> GdWo <NA> GdWo <NA>
## [78] MnPrv <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [89] MnPrv <NA> <NA> GdWo <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [100] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv
## [111] <NA> <NA> <NA> MnPrv MnPrv <NA> <NA> <NA> <NA> <NA> <NA>
## [122] MnPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnWw GdPrv <NA>
## [133] <NA> <NA> MnPrv MnPrv <NA> <NA> GdPrv MnPrv <NA> <NA> MnPrv
## [144] <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv <NA> GdWo <NA>
## [155] <NA> <NA> <NA> <NA> GdWo <NA> <NA> <NA> <NA> MnPrv <NA>
## [166] <NA> GdWo <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv
## [177] <NA> <NA> <NA> <NA> <NA> <NA> MnPrv <NA> MnPrv GdPrv GdPrv
## [188] MnPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> GdPrv
## [199] MnPrv <NA> <NA> MnPrv <NA> <NA> MnWw <NA> <NA> GdWo <NA>
## [210] MnPrv <NA> <NA> <NA> <NA> MnPrv MnPrv <NA> <NA> <NA> <NA>
## [221] <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA>
## [232] <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA>
## [243] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> GdWo <NA> <NA>
## [254] MnPrv <NA> <NA> <NA> <NA> <NA> <NA> GdWo <NA> GdWo <NA>
## [265] <NA> GdPrv <NA> <NA> <NA> GdPrv <NA> <NA> <NA> MnPrv <NA>
## [276] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [287] GdPrv <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA> GdPrv MnPrv
## [298] <NA> <NA> GdPrv <NA> <NA> <NA> GdWo <NA> <NA> <NA> MnPrv
## [309] <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> GdPrv <NA> <NA>
## [320] <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA> GdWo
## [331] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [342] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [353] <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> MnPrv <NA> <NA>
## [364] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv GdWo
## [375] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [386] <NA> <NA> GdWo <NA> <NA> MnPrv <NA> MnPrv GdWo <NA> <NA>
## [397] <NA> <NA> <NA> <NA> <NA> <NA> GdPrv <NA> <NA> GdWo <NA>
## [408] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [419] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [430] <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA> MnPrv GdPrv
## [441] <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA>
## [452] <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> MnPrv
## [463] GdWo <NA> <NA> <NA> GdWo <NA> <NA> <NA> <NA> <NA> <NA>
## [474] <NA> <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA>
## [485] <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA>
## [496] GdWo <NA> GdPrv MnPrv MnWw <NA> <NA> GdPrv GdWo <NA> <NA>
## [507] <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA> MnPrv <NA> GdPrv
## [518] <NA> MnPrv MnPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [529] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnWw <NA>
## [540] MnPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv <NA>
## [551] <NA> <NA> <NA> MnPrv <NA> <NA> MnWw <NA> <NA> <NA> <NA>
## [562] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [573] <NA> <NA> GdWo <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [584] <NA> <NA> <NA> MnWw <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [595] MnPrv <NA> <NA> <NA> <NA> GdPrv <NA> <NA> <NA> <NA> <NA>
## [606] <NA> GdPrv <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> MnPrv
## [617] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> GdWo
## [628] <NA> <NA> GdPrv MnPrv <NA> <NA> <NA> GdPrv <NA> <NA> <NA>
## [639] MnPrv <NA> <NA> <NA> MnPrv GdWo <NA> <NA> <NA> <NA> <NA>
## [650] MnPrv <NA> MnPrv <NA> MnPrv <NA> <NA> MnPrv MnPrv <NA> MnPrv
## [661] <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [672] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [683] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [694] <NA> MnPrv <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA> MnPrv
## [705] <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA>
## [716] MnPrv <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [727] <NA> <NA> <NA> <NA> <NA> <NA> <NA> GdWo <NA> <NA> <NA>
## [738] <NA> <NA> <NA> GdPrv GdPrv GdPrv <NA> <NA> <NA> <NA> <NA>
## [749] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> GdPrv <NA>
## [760] <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA>
## [771] <NA> <NA> MnPrv <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA>
## [782] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [793] <NA> <NA> <NA> MnPrv MnPrv <NA> <NA> MnPrv <NA> <NA> <NA>
## [804] <NA> GdWo <NA> <NA> <NA> GdWo MnPrv GdPrv <NA> <NA> <NA>
## [815] <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA> GdWo <NA>
## [826] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> GdWo <NA>
## [837] GdWo <NA> <NA> <NA> <NA> <NA> GdPrv <NA> <NA> <NA> <NA>
## [848] <NA> <NA> <NA> <NA> <NA> MnPrv MnPrv <NA> <NA> MnPrv <NA>
## [859] MnPrv <NA> GdPrv <NA> GdPrv GdPrv <NA> MnPrv <NA> <NA> <NA>
## [870] GdPrv <NA> <NA> GdWo <NA> <NA> <NA> <NA> <NA> MnPrv GdWo
## [881] <NA> <NA> MnPrv <NA> GdWo <NA> <NA> <NA> <NA> <NA> MnPrv
## [892] <NA> MnPrv GdPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [903] <NA> <NA> MnPrv MnPrv <NA> <NA> MnPrv <NA> <NA> <NA> MnPrv
## [914] <NA> <NA> <NA> <NA> <NA> GdPrv <NA> <NA> GdPrv <NA> <NA>
## [925] <NA> <NA> <NA> GdPrv <NA> <NA> <NA> GdWo <NA> <NA> <NA>
## [936] <NA> <NA> <NA> <NA> <NA> <NA> GdPrv <NA> <NA> <NA> MnWw
## [947] <NA> <NA> <NA> <NA> <NA> MnPrv MnPrv MnPrv <NA> <NA> GdPrv
## [958] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [969] GdWo <NA> <NA> <NA> <NA> <NA> GdPrv <NA> <NA> <NA> <NA>
## [980] MnPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [991] <NA> <NA> MnWw <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA>
## [1002] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1013] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1024] <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1035] MnPrv <NA> <NA> <NA> <NA> <NA> GdPrv <NA> <NA> <NA> MnPrv
## [1046] <NA> <NA> <NA> GdPrv <NA> <NA> <NA> GdPrv <NA> <NA> MnPrv
## [1057] <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv MnPrv <NA> <NA>
## [1068] <NA> GdPrv <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA>
## [1079] <NA> <NA> <NA> MnPrv <NA> MnPrv <NA> <NA> <NA> <NA> <NA>
## [1090] <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA>
## [1101] <NA> <NA> MnWw <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1112] <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA> GdWo MnPrv <NA>
## [1123] MnPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv MnPrv <NA>
## [1134] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1145] MnPrv <NA> <NA> <NA> <NA> <NA> GdPrv <NA> <NA> <NA> GdPrv
## [1156] <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1167] <NA> <NA> <NA> <NA> GdPrv <NA> <NA> MnPrv <NA> <NA> MnPrv
## [1178] <NA> <NA> <NA> GdPrv <NA> MnPrv <NA> <NA> <NA> <NA> GdPrv
## [1189] <NA> <NA> <NA> <NA> GdWo <NA> MnPrv <NA> <NA> <NA> <NA>
## [1200] <NA> <NA> <NA> <NA> <NA> MnPrv <NA> MnPrv <NA> <NA> <NA>
## [1211] GdPrv <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA>
## [1222] <NA> GdWo MnPrv <NA> MnWw <NA> <NA> <NA> GdWo <NA> GdWo
## [1233] <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv
## [1244] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> GdWo MnPrv
## [1255] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> GdPrv <NA>
## [1266] <NA> MnPrv <NA> GdWo GdWo <NA> <NA> MnPrv GdPrv <NA> <NA>
## [1277] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1288] <NA> <NA> <NA> <NA> <NA> <NA> GdWo <NA> GdWo MnPrv <NA>
## [1299] <NA> GdPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> GdWo
## [1310] GdWo <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1321] <NA> <NA> <NA> <NA> <NA> MnPrv <NA> MnPrv MnPrv <NA> <NA>
## [1332] <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA> GdWo <NA>
## [1343] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1354] <NA> <NA> GdPrv GdWo <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1365] <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA>
## [1376] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv MnPrv
## [1387] MnPrv GdWo <NA> <NA> <NA> <NA> MnWw <NA> <NA> <NA> <NA>
## [1398] <NA> GdPrv MnPrv <NA> <NA> <NA> <NA> MnPrv <NA> MnPrv MnPrv
## [1409] <NA> MnPrv <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1420] <NA> <NA> <NA> <NA> GdPrv GdWo <NA> <NA> <NA> MnPrv <NA>
## [1431] <NA> <NA> <NA> <NA> <NA> GdPrv GdWo <NA> MnPrv <NA> <NA>
## [1442] <NA> <NA> <NA> <NA> <NA> <NA> <NA> GdWo <NA> <NA> <NA>
## [1453] <NA> <NA> <NA> <NA> MnPrv GdPrv <NA> <NA> MnPrv <NA> MnPrv
## [1464] <NA> <NA> <NA> GdPrv <NA> <NA> MnPrv <NA> <NA> <NA> <NA>
## [1475] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1486] <NA> <NA> <NA> <NA> <NA> <NA> <NA> GdPrv <NA> <NA> <NA>
## [1497] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1508] <NA> GdWo <NA> <NA> GdPrv <NA> <NA> GdWo <NA> <NA> <NA>
## [1519] MnPrv GdPrv <NA> GdWo <NA> <NA> GdWo <NA> <NA> GdWo <NA>
## [1530] <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1541] <NA> MnPrv <NA> MnPrv MnPrv <NA> <NA> <NA> <NA> MnPrv <NA>
## [1552] GdPrv <NA> GdPrv <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA>
## [1563] <NA> <NA> MnPrv <NA> <NA> <NA> <NA> MnPrv MnPrv <NA> <NA>
## [1574] <NA> MnPrv <NA> <NA> <NA> GdPrv <NA> MnPrv MnPrv <NA> <NA>
## [1585] <NA> <NA> MnPrv MnPrv <NA> <NA> MnPrv MnPrv <NA> <NA> <NA>
## [1596] <NA> <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> GdPrv
## [1607] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1618] <NA> MnPrv <NA> GdWo <NA> GdWo <NA> <NA> <NA> <NA> <NA>
## [1629] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1640] <NA> <NA> <NA> <NA> <NA> <NA> GdWo <NA> <NA> <NA> GdPrv
## [1651] GdWo <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1662] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv
## [1673] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1684] <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA>
## [1695] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1706] GdPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1717] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1728] <NA> <NA> <NA> MnPrv <NA> <NA> MnPrv GdWo <NA> MnPrv GdPrv
## [1739] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1750] MnPrv <NA> GdPrv <NA> <NA> <NA> GdWo <NA> MnPrv <NA> MnPrv
## [1761] <NA> MnPrv <NA> MnPrv MnPrv GdPrv GdWo <NA> <NA> <NA> GdWo
## [1772] GdWo <NA> <NA> <NA> <NA> <NA> MnPrv MnPrv GdWo GdWo <NA>
## [1783] <NA> <NA> GdWo <NA> <NA> <NA> <NA> <NA> <NA> GdWo MnPrv
## [1794] <NA> <NA> <NA> <NA> GdWo <NA> GdWo <NA> <NA> <NA> <NA>
## [1805] <NA> <NA> <NA> MnPrv <NA> MnPrv <NA> <NA> MnPrv <NA> <NA>
## [1816] GdWo <NA> <NA> <NA> <NA> <NA> MnPrv <NA> GdWo <NA> <NA>
## [1827] MnPrv <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA>
## [1838] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA>
## [1849] <NA> GdPrv <NA> MnPrv <NA> <NA> <NA> MnPrv <NA> <NA> GdPrv
## [1860] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1871] <NA> <NA> <NA> <NA> GdPrv <NA> <NA> <NA> MnPrv <NA> <NA>
## [1882] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv <NA> MnPrv
## [1893] <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv GdPrv MnPrv <NA>
## [1904] MnPrv <NA> <NA> <NA> GdPrv GdPrv MnPrv <NA> GdPrv <NA> <NA>
## [1915] <NA> <NA> <NA> MnPrv GdWo <NA> <NA> <NA> <NA> <NA> <NA>
## [1926] <NA> MnPrv <NA> MnPrv <NA> GdPrv <NA> MnPrv <NA> <NA> <NA>
## [1937] <NA> <NA> <NA> <NA> <NA> GdPrv <NA> <NA> <NA> <NA> <NA>
## [1948] <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> MnPrv GdWo GdPrv
## [1959] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1970] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1981] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [1992] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2003] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2014] <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA> GdPrv <NA>
## [2025] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2036] <NA> <NA> <NA> <NA> <NA> GdWo <NA> <NA> GdWo <NA> <NA>
## [2047] <NA> <NA> <NA> <NA> GdWo <NA> MnPrv <NA> MnPrv <NA> MnPrv
## [2058] MnPrv <NA> <NA> <NA> <NA> MnPrv <NA> GdWo <NA> <NA> <NA>
## [2069] GdPrv <NA> <NA> GdWo GdWo MnPrv <NA> <NA> <NA> <NA> GdWo
## [2080] GdWo MnWw <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv <NA>
## [2091] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv <NA>
## [2102] MnPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA> GdPrv <NA> <NA>
## [2113] <NA> MnPrv GdPrv <NA> GdPrv <NA> <NA> MnPrv MnPrv <NA> <NA>
## [2124] MnPrv <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA> MnPrv
## [2135] <NA> <NA> <NA> MnPrv <NA> MnPrv GdWo MnPrv <NA> <NA> <NA>
## [2146] <NA> <NA> MnPrv MnPrv <NA> <NA> <NA> MnPrv <NA> GdWo GdPrv
## [2157] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2168] <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA>
## [2179] <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> MnPrv <NA>
## [2190] <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA> MnPrv
## [2201] <NA> GdPrv <NA> <NA> <NA> MnPrv <NA> <NA> MnPrv GdPrv MnPrv
## [2212] <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2223] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2234] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2245] GdWo GdWo <NA> GdWo <NA> <NA> <NA> <NA> <NA> MnPrv <NA>
## [2256] <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA>
## [2267] <NA> <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA>
## [2278] GdWo <NA> GdWo <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2289] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2300] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2311] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2322] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2333] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2344] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2355] <NA> <NA> <NA> GdPrv MnPrv <NA> <NA> <NA> <NA> <NA> <NA>
## [2366] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2377] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA>
## [2388] <NA> GdWo <NA> GdPrv <NA> MnPrv GdPrv MnPrv MnPrv <NA> MnPrv
## [2399] MnPrv MnPrv <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2410] <NA> GdWo <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> GdWo
## [2421] GdPrv MnPrv <NA> GdWo <NA> <NA> <NA> MnPrv <NA> <NA> <NA>
## [2432] <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA> MnPrv <NA>
## [2443] <NA> MnPrv <NA> MnPrv MnPrv <NA> <NA> <NA> MnPrv MnPrv <NA>
## [2454] <NA> <NA> <NA> <NA> GdWo MnPrv <NA> MnPrv GdPrv <NA> MnPrv
## [2465] <NA> <NA> GdPrv MnPrv MnPrv <NA> <NA> <NA> <NA> <NA> <NA>
## [2476] MnPrv MnPrv <NA> MnPrv <NA> MnPrv GdPrv <NA> <NA> MnPrv <NA>
## [2487] <NA> MnPrv <NA> <NA> <NA> GdPrv <NA> <NA> <NA> <NA> <NA>
## [2498] <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA> <NA>
## [2509] <NA> <NA> <NA> <NA> GdPrv <NA> <NA> <NA> <NA> GdPrv <NA>
## [2520] <NA> <NA> <NA> <NA> <NA> MnPrv MnPrv MnPrv <NA> <NA> <NA>
## [2531] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2542] <NA> <NA> <NA> <NA> GdPrv <NA> <NA> <NA> <NA> <NA> <NA>
## [2553] <NA> GdWo <NA> MnPrv <NA> <NA> MnPrv <NA> MnPrv <NA> MnPrv
## [2564] GdPrv <NA> <NA> MnPrv <NA> <NA> MnPrv <NA> <NA> GdWo <NA>
## [2575] MnPrv <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA> MnPrv <NA>
## [2586] <NA> <NA> <NA> <NA> <NA> GdPrv <NA> <NA> <NA> <NA> <NA>
## [2597] <NA> <NA> <NA> GdPrv <NA> <NA> <NA> <NA> GdWo MnPrv MnPrv
## [2608] GdPrv GdPrv GdWo <NA> MnPrv MnPrv <NA> <NA> <NA> <NA> <NA>
## [2619] <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA>
## [2630] <NA> <NA> <NA> <NA> <NA> <NA> GdPrv <NA> <NA> <NA> GdWo
## [2641] <NA> <NA> <NA> <NA> <NA> <NA> GdPrv <NA> <NA> <NA> <NA>
## [2652] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2663] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2674] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> GdPrv
## [2685] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2696] <NA> GdPrv <NA> MnPrv <NA> <NA> <NA> <NA> MnPrv MnPrv MnPrv
## [2707] <NA> MnPrv GdWo <NA> GdPrv <NA> <NA> <NA> <NA> <NA> GdPrv
## [2718] <NA> MnPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA> MnPrv <NA>
## [2729] <NA> MnPrv MnPrv GdWo <NA> MnPrv <NA> MnPrv MnPrv <NA> <NA>
## [2740] <NA> GdPrv <NA> MnPrv <NA> <NA> MnPrv <NA> GdWo <NA> MnPrv
## [2751] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2762] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> GdWo <NA>
## [2773] <NA> <NA> <NA> <NA> <NA> MnPrv <NA> MnPrv <NA> <NA> <NA>
## [2784] <NA> MnPrv <NA> GdWo <NA> <NA> <NA> <NA> <NA> GdPrv <NA>
## [2795] <NA> <NA> <NA> <NA> MnPrv <NA> <NA> GdWo <NA> <NA> <NA>
## [2806] <NA> <NA> <NA> GdPrv GdWo MnPrv GdPrv <NA> <NA> <NA> <NA>
## [2817] <NA> MnPrv <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA>
## [2828] MnPrv <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2839] <NA> <NA> <NA> <NA> <NA> <NA> MnPrv <NA> <NA> <NA> <NA>
## [2850] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2861] <NA> <NA> <NA> <NA> <NA> <NA> <NA> GdPrv <NA> MnPrv <NA>
## [2872] <NA> <NA> MnPrv <NA> MnPrv <NA> <NA> <NA> <NA> MnPrv <NA>
## [2883] <NA> GdWo MnPrv <NA> <NA> MnPrv GdWo MnPrv <NA> <NA> <NA>
## [2894] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## [2905] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> GdPrv <NA>
## [2916] <NA> <NA> MnPrv <NA>
## Levels: GdPrv GdWo MnPrv MnWw
all$Fence[is.na(all$Fence)]<-'None'
table(all$Fence)
##
## GdPrv GdWo MnPrv MnWw None
## 118 112 329 12 2348
ggplot(all[!is.na(SalePrice),],aes(Fence,SalePrice))+
geom_bar(stat='summary',fun.y='median', fill='blue')+
scale_y_continuous(breaks=seq(0,200000,50000),labels=comma)

detach("package:Rmisc", unload=TRUE)
detach("package:plyr", unload=TRUE)
all[!is.na(SalePrice),]%>% group_by(Fence)%>%summarise(median=median(SalePrice),counts=n())
all$Fence<-factor(all$Fence)
#Fence有無無顯著影響
Imputing missing data:Fire place
library(plyr)
table(all$FireplaceQu)
##
## Ex Fa Gd Po TA
## 43 74 744 46 592
all$FireplaceQu[is.na(all$FireplaceQu)]<-"None"
table(all$FireplaceQu)
##
## Ex Fa Gd None Po TA
## 43 74 744 1420 46 592
all$FireplaceQu<-as.integer(plyr::revalue(all$FireplaceQu,Qualities))
##########
table(all$Fireplaces)
##
## 0 1 2 3 4
## 1420 1268 219 11 1
sum(is.na(all$Fireplaces))
## [1] 0
class(all$Fireplaces)
## [1] "numeric"
Imputing missing data: Basement
NaNum
## function(df){
## sort(colSums(sapply(all[which(colSums(is.na(df))>0)],is.na)),decreasing = TRUE)
## }
length(which(is.na(all$BsmtQual)&is.na(all$BsmtCond)&is.na(all$BsmtExposure)&is.na(all$BsmtFinType1) &is.na(all$BsmtFinType2)))
## [1] 79
all[!is.na(all$BsmtFinType1) & (is.na(all$BsmtQual)|is.na(all$BsmtExposure)|is.na(all$BsmtFinType2)|is.na(all$BsmtCond)), c("BsmtQual","BsmtExposure","BsmtFinType2","BsmtCond")]
which(!is.na(all$BsmtFinType1) &(is.na(all$BsmtQual)|is.na(all$BsmtExposure)|is.na(all$BsmtFinType2)|is.na(all$BsmtCond)))
## [1] 333 949 1488 2041 2186 2218 2219 2349 2525
##So altogether, it seems as if there are 79 houses without a basement, because the basement variables of the other houses with missing values are all 80% complete (missing 1 out of 5 values). I am going to impute the modes to fix those 9 houses.
#Imputing modes.
all$BsmtFinType2[333]<-names(sort(-table(all$BsmtFinType2)))[1]
all$BsmtCond[c(2041,2186,2525)]<-names(sort(-table(all$BsmtCond)))[1]
all$BsmtExposure[c(949,1488,2349)]<-names(sort(-table(all$BsmtExposure)))[1]
all$BsmtQual[c(2218,2219)]<-names(sort(-table(all$BsmtQual)))[1]
######
all$BsmtQual[is.na(all$BsmtQual)]<-"None"
all$BsmtQual<-as.integer(plyr::revalue(all$BsmtQual,Qualities))
table(all$BsmtQual)
##
## 0 2 3 4 5
## 79 88 1285 1209 258
###
all$BsmtCond[is.na(all$BsmtCond)]<-'None'
all$BsmtCond<-as.integer(plyr::revalue(all$BsmtCond,Qualities))
table(all$BsmtCond)
##
## 0 1 2 3 4
## 79 5 104 2609 122
###
all$BsmtExposure[is.na(all$BsmtExposure)]<-'None'
table(all$BsmtExposure)
##
## Av Gd Mn No None
## 418 276 239 1907 79
Exposure<-c('None'=0,'No'=1, 'Mn'=2,'Gd'=3,'Av'=4)
all$BsmtExposure<-as.integer(plyr::revalue(all$BsmtExposure,Exposure))
###
all$BsmtFinType1[is.na(all$BsmtFinType1)]<-'None'
FinType<-c('None'=0,'Unf'=1,'LwQ'=2,'Rec'=3,'BLQ'=4,'ALQ'=5,'GLQ'=6)
all$BsmtFinType1<-as.integer(plyr::revalue(all$BsmtFinType1,FinType))
table(all$BsmtFinType1)
##
## 0 1 2 3 4 5 6
## 79 851 154 288 269 429 849
###
all$BsmtFinType2[is.na(all$BsmtFinType2)]<-'None'
all$BsmtFinType2<-as.integer(plyr::revalue(all$BsmtFinType2,FinType))
table(all$BsmtFinType2)
##
## 0 1 2 3 4 5 6
## 79 2494 87 105 68 52 34
all[(is.na(all$BsmtUnfSF)|is.na(all$BsmtFinSF2)|is.na(all$BsmtFinSF1)|is.na(all$BsmtHalfBath)|is.na(all$BsmtFullBath)|is.na(all$TotalBsmtSF)),c('BsmtUnfSF','BsmtFinSF2','BsmtFinSF1','BsmtHalfBath','BsmtFullBath','TotalBsmtSF')]
which(is.na(all$BsmtUnfSF)|is.na(all$BsmtFinSF2)|is.na(all$BsmtFinSF1)|is.na(all$BsmtHalfBath)|is.na(all$BsmtFullBath)|is.na(all$TotalBsmtSF))
## [1] 2121 2189
all[c(2121,2189),c('BsmtUnfSF','BsmtFinSF2','BsmtFinSF1','BsmtHalfBath','BsmtFullBath','TotalBsmtSF')]
table(all$TotalBsmtSF)
##
## 0 105 160 173 190 192 216 240 245 264 270 279 290 297 301
## 78 1 1 1 1 1 2 1 1 4 1 1 1 1 1
## 319 346 348 352 356 360 370 372 381 384 385 392 396 405 407
## 1 2 1 1 1 1 1 1 1 19 1 2 1 1 1
## 408 409 416 420 423 430 432 440 448 450 451 456 458 462 468
## 1 1 2 1 1 1 1 3 2 1 1 2 1 2 1
## 480 481 482 483 484 485 492 494 498 502 504 506 516 520 525
## 2 1 1 14 1 1 1 1 1 1 2 1 3 5 5
## 526 528 530 531 533 536 539 540 544 546 547 550 551 552 554
## 1 4 4 1 2 2 1 2 1 16 6 1 1 1 1
## 559 560 561 565 569 570 572 576 583 585 588 592 596 600 602
## 1 5 3 1 1 3 7 5 1 6 4 2 3 16 1
## 608 610 611 612 616 617 621 622 624 625 626 628 629 630 631
## 5 1 1 3 6 1 1 1 12 2 1 1 1 13 1
## 635 636 637 641 644 648 649 650 651 654 655 656 658 660 661
## 1 2 3 2 2 2 2 3 2 1 1 1 1 7 1
## 662 663 666 671 672 673 675 676 677 678 680 683 684 686 687
## 2 3 2 1 29 1 2 6 1 1 6 1 7 7 1
## 689 690 691 693 696 697 698 699 700 702 704 707 708 709 710
## 6 2 5 1 1 3 7 1 2 3 4 4 1 2 1
## 712 713 714 715 716 718 720 721 723 725 727 728 729 731 732
## 3 2 1 5 4 1 16 1 3 6 1 20 3 2 5
## 734 735 736 738 739 740 741 742 744 745 746 747 750 752 754
## 1 4 2 9 1 2 4 3 6 1 1 6 3 2 2
## 755 756 757 760 761 763 764 765 768 770 771 773 774 776 777
## 3 17 1 1 1 1 5 4 24 3 2 2 1 2 2
## 778 779 780 781 782 783 784 788 789 791 792 793 794 795 796
## 1 1 19 2 4 3 11 3 3 1 2 7 3 3 11
## 797 798 799 800 801 802 803 804 806 807 808 810 811 812 813
## 1 5 3 2 2 1 2 6 4 1 5 3 1 2 3
## 814 815 816 817 818 819 821 822 824 825 826 827 828 830 831
## 2 1 23 2 2 2 1 2 3 6 1 2 3 1 3
## 832 833 835 836 837 839 840 841 842 844 845 846 847 848 849
## 17 5 5 6 1 1 13 1 1 2 7 2 7 16 1
## 850 851 852 853 854 855 856 858 859 860 861 862 864 866 868
## 2 2 1 2 3 9 5 9 2 4 6 1 74 3 4
## 869 870 872 873 874 876 878 879 880 882 883 884 886 888 890
## 1 3 3 1 2 11 1 2 5 7 1 10 1 2 1
## 891 892 893 894 895 896 900 901 902 903 904 907 908 910 911
## 1 2 1 17 1 8 4 6 1 1 1 4 2 7 2
## 912 913 914 915 916 917 918 920 922 923 924 925 926 927 928
## 26 2 4 3 4 1 4 6 1 4 5 8 7 1 9
## 929 930 931 932 935 936 938 939 940 941 942 943 944 945 946
## 2 5 2 3 2 16 5 2 3 5 2 4 3 5 1
## 948 949 950 951 952 953 954 955 956 957 958 959 960 961 963
## 5 3 2 4 13 2 2 3 1 2 2 1 18 1 1
## 964 965 966 967 968 969 970 971 972 973 974 975 976 978 979
## 2 1 2 5 1 3 8 1 7 3 1 5 6 3 2
## 980 981 982 983 984 985 988 989 990 991 992 993 994 995 996
## 6 3 5 1 6 2 12 1 8 2 8 1 4 2 3
## 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1010 1012 1013 1014
## 2 1 2 3 1 1 4 4 1 1 19 3 2 2 1
## 1015 1017 1019 1020 1021 1022 1024 1025 1026 1027 1028 1029 1030 1031 1032
## 2 2 1 3 1 5 5 1 5 2 2 5 1 1 5
## 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048
## 1 1 1 2 1 1 25 3 2 2 2 2 1 1 3
## 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063
## 2 5 4 5 4 7 4 10 6 4 2 2 1 2 3
## 1064 1065 1066 1067 1068 1069 1070 1072 1073 1074 1075 1076 1077 1078 1079
## 6 4 1 2 2 3 2 1 8 2 2 1 2 6 2
## 1080 1081 1082 1083 1084 1085 1086 1088 1089 1090 1091 1092 1093 1094 1095
## 7 1 3 1 2 1 3 4 1 5 1 7 2 5 3
## 1096 1097 1098 1099 1100 1103 1104 1105 1106 1107 1108 1109 1112 1113 1114
## 2 3 1 2 7 2 6 5 1 4 3 1 2 3 6
## 1116 1117 1118 1120 1121 1122 1124 1125 1126 1127 1128 1129 1130 1131 1132
## 2 5 1 3 3 4 3 1 2 4 4 1 1 1 1
## 1134 1135 1136 1138 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150
## 2 1 2 3 4 5 4 5 6 5 1 1 3 2 4
## 1151 1152 1153 1154 1156 1157 1158 1160 1161 1162 1163 1164 1165 1166 1168
## 3 6 3 1 2 1 6 4 1 2 1 3 3 1 7
## 1169 1170 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1184 1187
## 2 1 1 2 1 3 6 1 1 2 1 3 1 2 2
## 1188 1189 1190 1191 1192 1193 1194 1195 1196 1198 1199 1200 1202 1203 1204
## 5 1 3 2 1 1 1 2 4 1 2 7 2 1 4
## 1205 1206 1208 1209 1210 1211 1212 1214 1215 1216 1217 1218 1219 1220 1221
## 1 1 7 2 1 1 4 4 1 7 1 5 1 5 1
## 1222 1223 1224 1225 1226 1228 1230 1231 1232 1234 1235 1236 1237 1240 1241
## 1 2 3 1 2 4 1 1 4 2 2 1 3 4 1
## 1242 1243 1244 1246 1247 1248 1249 1250 1251 1252 1253 1254 1256 1257 1258
## 4 1 3 3 1 6 2 1 2 2 1 1 4 2 3
## 1259 1260 1261 1262 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274
## 1 2 1 2 2 1 2 2 3 1 1 1 2 2 1
## 1276 1277 1278 1280 1281 1282 1284 1286 1288 1290 1291 1292 1293 1295 1296
## 2 2 2 3 1 1 3 2 4 2 1 1 1 1 3
## 1297 1298 1299 1300 1302 1304 1306 1308 1309 1310 1311 1312 1313 1314 1316
## 1 2 1 3 6 2 3 1 1 3 1 2 4 4 2
## 1317 1318 1319 1324 1325 1326 1328 1329 1330 1331 1332 1334 1335 1336 1337
## 1 1 2 4 1 2 1 2 2 1 2 2 1 2 4
## 1338 1339 1340 1341 1342 1344 1346 1347 1348 1349 1350 1351 1352 1356 1357
## 3 1 1 2 4 4 2 2 2 1 4 1 2 1 1
## 1358 1360 1361 1362 1363 1364 1365 1367 1368 1369 1370 1372 1373 1374 1375
## 3 2 1 7 1 2 2 2 3 1 3 4 2 1 1
## 1376 1377 1378 1379 1380 1381 1382 1383 1385 1386 1388 1389 1390 1391 1392
## 1 1 1 1 1 1 1 2 1 2 1 2 4 2 6
## 1393 1394 1395 1396 1398 1400 1401 1402 1405 1406 1408 1409 1410 1413 1414
## 2 1 2 2 3 1 1 1 6 2 2 1 1 1 2
## 1415 1416 1417 1418 1419 1420 1422 1424 1425 1426 1427 1428 1430 1431 1432
## 1 1 2 2 2 4 4 1 2 1 2 1 3 2 1
## 1433 1434 1436 1437 1438 1440 1441 1442 1444 1445 1449 1450 1451 1452 1453
## 1 2 3 2 1 5 1 2 3 1 1 1 1 1 4
## 1454 1455 1459 1460 1461 1462 1463 1466 1468 1470 1473 1475 1476 1477 1478
## 2 1 1 1 3 3 2 3 4 3 1 1 1 1 3
## 1479 1480 1482 1484 1485 1486 1487 1488 1489 1490 1491 1492 1494 1495 1496
## 2 1 3 2 1 1 1 4 3 1 2 2 6 1 2
## 1498 1499 1500 1501 1502 1504 1505 1508 1509 1510 1511 1512 1516 1517 1518
## 3 1 1 3 2 3 1 2 1 3 1 1 1 2 3
## 1519 1520 1522 1524 1525 1528 1529 1530 1531 1533 1536 1538 1540 1541 1542
## 1 1 1 3 1 3 1 3 1 1 1 1 1 2 2
## 1544 1546 1550 1552 1553 1554 1555 1556 1557 1559 1560 1561 1562 1563 1564
## 1 2 1 1 1 2 1 2 1 1 5 2 1 1 1
## 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1577 1578 1580 1581
## 1 2 1 4 1 1 1 2 5 2 1 1 2 4 1
## 1582 1584 1587 1588 1590 1592 1593 1594 1595 1596 1598 1600 1602 1603 1604
## 4 3 2 2 1 1 1 5 3 2 2 1 3 1 3
## 1606 1610 1612 1614 1615 1616 1617 1618 1620 1621 1622 1623 1624 1625 1626
## 2 1 1 5 1 2 2 1 3 1 1 1 2 3 3
## 1629 1630 1632 1638 1641 1642 1643 1645 1649 1650 1652 1654 1656 1657 1660
## 2 1 3 2 1 1 3 1 1 2 3 1 2 2 2
## 1664 1666 1670 1671 1673 1675 1678 1679 1680 1682 1683 1684 1685 1686 1689
## 2 2 2 1 2 1 1 1 5 2 1 1 1 4 2
## 1694 1696 1698 1700 1702 1704 1705 1706 1709 1710 1712 1713 1714 1719 1720
## 4 2 3 2 2 2 2 1 1 3 1 1 1 1 2
## 1721 1722 1726 1728 1732 1733 1734 1736 1738 1739 1740 1742 1746 1748 1751
## 2 1 2 8 1 1 3 1 1 1 2 2 1 1 1
## 1752 1753 1760 1763 1765 1766 1768 1774 1776 1777 1778 1779 1780 1782 1790
## 1 2 1 1 1 1 1 1 2 1 1 1 1 2 1
## 1792 1795 1800 1802 1803 1809 1814 1822 1824 1829 1832 1833 1836 1838 1840
## 2 1 5 1 1 1 1 3 2 1 1 1 4 4 1
## 1842 1844 1845 1848 1850 1851 1856 1858 1860 1865 1866 1868 1869 1884 1896
## 1 3 1 2 2 1 1 1 1 1 1 2 1 3 1
## 1898 1902 1905 1907 1910 1921 1922 1926 1930 1932 1934 1935 1947 1949 1950
## 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1
## 1952 1958 1964 1966 1967 1976 1978 1980 1982 1986 1992 1994 2000 2002 2006
## 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1
## 2014 2020 2024 2033 2035 2036 2042 2046 2048 2062 2076 2077 2078 2108 2109
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 2110 2121 2136 2140 2153 2158 2171 2190 2200 2208 2216 2217 2220 2223 2271
## 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1
## 2320 2330 2392 2396 2418 2444 2452 2458 2461 2492 2524 2535 2552 2630 2633
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 2660 2846 3094 3138 3200 3206 5095 6110
## 1 1 1 1 1 1 1 1
all$TotalBsmtSF[c(2121,2189)]<-0
table(all$BsmtFullBath)
##
## 0 1 2 3
## 1705 1172 38 2
all$BsmtFullBath[c(2121,2189)]<-0
table(all$BsmtHalfBath)
##
## 0 1 2
## 2742 171 4
all$BsmtHalfBath[c(2121,2189)]<-0
table(all$BsmtFinSF1)
##
## 0 2 16 20 24 25 27 28 32 33 35 36 40 41 42
## 929 1 14 8 27 1 1 5 1 1 1 4 3 1 1
## 48 49 50 51 52 53 54 55 56 57 60 63 64 65 68
## 4 1 2 1 2 1 2 1 4 2 4 1 1 1 3
## 70 72 73 75 76 77 78 80 81 85 88 94 96 100 104
## 2 1 1 1 1 1 1 4 2 1 1 1 1 1 5
## 108 110 111 113 114 116 119 120 121 122 125 126 128 129 130
## 1 3 2 1 3 3 3 6 2 1 1 1 1 1 3
## 131 132 133 134 138 140 141 143 144 148 149 150 152 154 155
## 1 1 2 1 3 1 1 1 5 1 1 3 2 1 1
## 156 162 165 167 168 169 170 172 173 175 176 179 180 181 182
## 3 2 1 1 5 1 1 1 2 1 3 1 3 1 3
## 185 186 187 188 189 190 191 192 193 194 196 197 198 200 201
## 1 1 2 2 3 2 1 4 1 2 5 1 2 1 1
## 203 204 205 206 207 208 209 210 212 213 215 216 218 219 220
## 3 1 1 1 1 1 2 4 1 1 1 3 1 1 3
## 221 222 223 224 225 226 228 230 231 234 236 238 239 240 241
## 1 1 1 3 1 2 2 1 2 3 2 1 1 3 2
## 242 244 246 247 248 249 250 251 252 254 256 257 258 259 260
## 1 1 2 4 2 2 5 1 5 1 3 3 1 2 2
## 261 262 263 264 266 267 270 271 273 274 275 276 278 279 280
## 1 2 1 3 2 2 1 1 2 2 2 6 1 1 5
## 281 282 283 284 285 286 288 290 292 294 296 297 298 299 300
## 2 2 2 3 2 1 8 2 3 3 1 3 2 4 9
## 301 305 306 308 309 310 311 312 314 315 316 317 318 319 320
## 2 2 3 3 1 3 3 6 1 3 1 2 1 4 2
## 321 322 324 326 328 329 330 331 332 334 335 336 337 338 339
## 2 1 2 2 1 4 5 2 1 2 1 5 1 4 1
## 340 341 342 343 344 346 347 348 349 350 351 352 353 354 355
## 3 3 2 2 1 1 1 2 1 4 3 2 5 2 1
## 356 358 360 361 362 363 364 365 366 368 369 370 371 372 373
## 1 2 7 2 1 1 1 1 1 6 1 3 3 2 1
## 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388
## 2 7 1 1 4 3 2 3 1 1 8 2 2 2 2
## 389 390 392 393 394 397 398 399 400 402 403 404 405 406 408
## 1 4 2 1 1 3 1 3 5 2 4 1 2 2 4
## 410 412 414 415 416 417 419 420 421 422 423 424 425 426 427
## 4 1 3 2 2 2 2 3 2 1 1 1 4 2 1
## 428 429 430 432 433 434 435 436 437 438 439 440 441 442 443
## 4 1 1 5 1 1 3 1 3 2 1 1 2 4 1
## 444 445 448 450 452 453 454 455 456 457 458 459 460 462 464
## 2 3 2 5 1 2 3 1 7 3 2 1 3 3 1
## 465 466 467 468 469 471 472 474 475 476 477 480 481 482 483
## 1 1 1 6 1 2 2 3 1 4 1 4 1 1 4
## 484 485 486 488 489 490 491 492 493 494 495 496 497 498 500
## 2 2 3 1 2 4 1 3 2 3 4 1 1 2 7
## 501 502 503 504 505 506 507 509 510 512 513 514 515 516 518
## 1 1 3 6 3 2 4 2 4 2 3 1 1 3 1
## 520 521 522 523 524 526 527 528 531 532 533 534 535 536 537
## 4 2 3 2 2 1 3 6 2 3 3 2 2 2 1
## 538 539 540 543 544 546 547 548 549 550 551 552 553 554 556
## 1 4 3 1 6 4 6 3 4 5 1 3 6 2 1
## 557 559 560 562 564 565 566 567 568 569 570 572 573 574 575
## 1 1 6 1 2 4 2 2 2 3 3 1 1 3 2
## 576 577 578 579 580 583 584 585 586 587 588 590 592 593 594
## 2 1 3 1 2 2 2 2 1 1 6 1 1 2 3
## 595 596 599 600 601 602 603 604 605 606 607 608 609 611 612
## 5 1 2 8 1 7 3 2 1 2 1 2 2 2 4
## 614 615 616 617 619 620 621 622 623 624 625 626 630 631 632
## 1 1 5 2 2 1 2 2 2 7 6 4 2 1 3
## 633 634 636 637 638 639 641 642 643 644 645 646 647 648 649
## 4 1 2 5 2 2 4 1 3 3 1 3 1 5 2
## 650 651 652 654 655 656 658 659 660 661 662 663 664 666 668
## 2 2 1 2 4 4 3 4 4 1 6 4 2 3 2
## 669 670 671 672 673 674 678 679 680 681 682 683 684 685 686
## 1 1 1 5 2 3 2 2 3 2 2 1 1 4 5
## 687 688 689 690 691 692 694 695 696 697 698 699 700 701 702
## 1 1 1 3 1 1 1 3 1 5 3 2 7 1 1
## 704 705 706 708 709 710 712 713 714 716 717 718 719 720 722
## 4 2 3 2 1 1 3 1 1 2 2 1 2 2 1
## 724 725 726 727 728 729 731 732 733 734 735 736 737 738 739
## 2 1 1 2 3 2 1 5 3 4 2 2 2 2 3
## 740 741 742 744 745 746 747 748 749 750 751 755 756 758 759
## 2 2 1 2 2 1 4 2 1 1 1 3 1 5 1
## 760 762 763 764 765 766 767 769 770 771 772 773 774 775 776
## 2 3 2 1 2 2 5 3 2 1 1 3 2 3 3
## 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791
## 2 3 3 3 4 1 1 5 2 3 2 4 3 2 2
## 792 793 794 795 796 797 799 800 803 804 806 808 809 810 811
## 1 2 1 1 2 1 2 3 2 2 1 1 1 2 1
## 812 813 814 816 819 820 821 822 824 826 827 828 830 831 832
## 4 2 2 4 2 3 2 2 2 1 1 4 1 2 3
## 833 836 837 838 840 841 842 844 846 847 848 850 851 852 853
## 2 4 1 1 1 4 1 1 3 2 1 2 4 1 1
## 854 856 859 860 862 863 864 865 866 867 870 871 872 873 874
## 1 3 1 2 1 1 4 2 3 1 3 2 3 1 1
## 876 880 881 883 885 888 890 892 893 894 895 896 897 899 900
## 2 1 1 1 1 1 2 1 1 2 1 1 1 1 1
## 901 902 903 904 905 906 908 909 910 912 913 914 915 916 918
## 2 4 2 5 1 1 1 1 2 1 2 1 5 1 1
## 919 920 922 923 924 925 926 929 930 931 932 935 936 937 938
## 1 3 2 2 1 2 1 2 2 2 2 1 7 1 2
## 939 941 943 944 945 946 949 950 951 952 953 954 955 956 958
## 1 2 1 2 2 1 1 1 1 1 1 1 1 2 2
## 960 962 964 965 967 968 969 970 973 975 976 978 980 982 983
## 1 3 3 1 2 1 1 1 1 2 2 1 2 1 1
## 984 985 986 987 988 990 991 994 996 998 1000 1001 1002 1003 1004
## 1 1 4 1 4 1 1 2 2 3 4 1 1 1 2
## 1005 1010 1011 1012 1013 1014 1015 1016 1018 1021 1022 1023 1024 1026 1027
## 3 1 1 2 1 1 1 2 2 1 1 3 2 1 1
## 1029 1030 1032 1033 1034 1035 1036 1037 1038 1039 1040 1044 1046 1047 1048
## 1 2 2 1 1 1 4 1 1 2 2 1 1 1 1
## 1051 1053 1056 1059 1064 1065 1070 1071 1073 1074 1075 1078 1079 1080 1082
## 1 2 3 4 1 2 2 1 1 1 1 2 1 1 2
## 1084 1085 1086 1087 1088 1090 1092 1094 1096 1097 1098 1101 1104 1106 1110
## 3 1 1 1 2 1 1 1 1 1 1 1 2 1 2
## 1111 1112 1115 1116 1118 1121 1122 1124 1126 1127 1128 1129 1136 1137 1138
## 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1
## 1141 1142 1148 1149 1150 1151 1152 1153 1154 1157 1158 1159 1162 1163 1165
## 1 1 3 1 1 1 2 3 1 1 1 2 1 1 1
## 1170 1172 1173 1178 1180 1181 1182 1186 1188 1191 1194 1196 1198 1200 1201
## 1 1 3 1 1 1 1 1 1 1 1 2 2 5 3
## 1204 1206 1213 1216 1218 1219 1220 1223 1224 1225 1230 1231 1232 1234 1236
## 1 2 1 1 4 2 2 1 1 1 1 1 2 2 1
## 1237 1238 1239 1243 1246 1247 1249 1252 1258 1259 1260 1261 1262 1270 1271
## 1 1 1 1 2 1 3 1 2 1 1 1 1 1 1
## 1274 1277 1280 1282 1283 1285 1288 1290 1294 1298 1300 1302 1304 1308 1309
## 3 1 1 1 1 1 2 1 1 1 4 1 1 1 3
## 1312 1314 1319 1320 1324 1328 1329 1330 1332 1333 1334 1336 1337 1338 1341
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1346 1350 1351 1358 1359 1360 1361 1369 1373 1375 1383 1386 1387 1390 1392
## 1 2 1 1 1 2 1 2 3 1 2 2 2 1 1
## 1398 1400 1406 1410 1412 1414 1416 1420 1422 1430 1433 1436 1440 1441 1443
## 1 2 1 2 1 1 2 1 1 2 1 1 1 1 1
## 1445 1447 1455 1456 1460 1464 1470 1474 1476 1478 1480 1500 1505 1513 1518
## 1 1 2 1 1 1 1 1 2 1 1 1 1 1 2
## 1531 1538 1540 1557 1558 1562 1564 1567 1571 1572 1573 1576 1593 1606 1619
## 1 1 1 1 1 1 1 2 1 2 3 1 1 1 1
## 1632 1636 1640 1646 1660 1682 1684 1696 1721 1728 1732 1733 1758 1767 1810
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1812 1836 1880 1904 1965 1972 2085 2096 2146 2158 2188 2257 2260 2288 4010
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 5644
## 1
all$BsmtFinSF1[c(2121,2189)]<-0
table(all$BsmtFinSF2)
##
## 0 6 12 28 32 35 38 40 41 42 46 48 52 60 63
## 2571 1 1 1 1 1 1 2 2 2 1 1 1 2 1
## 64 66 68 72 76 78 80 81 92 93 95 96 102 105 106
## 2 1 2 2 1 1 2 1 1 2 1 2 1 2 1
## 108 110 113 116 117 119 120 121 123 125 127 128 132 136 138
## 2 2 1 2 2 1 1 2 1 1 2 2 1 1 1
## 139 144 147 149 150 153 154 156 159 162 163 165 167 168 169
## 1 3 3 1 1 1 1 1 2 3 1 1 1 3 1
## 173 174 175 177 180 181 182 184 186 193 196 201 202 206 208
## 1 2 1 1 5 1 2 1 1 1 1 1 2 1 1
## 210 211 215 216 219 227 228 230 232 239 240 243 247 250 252
## 2 1 1 1 1 1 1 1 1 1 2 1 2 1 2
## 258 259 262 263 264 270 273 276 278 279 281 284 286 287 288
## 1 1 1 1 2 2 2 2 1 2 1 1 1 2 2
## 290 294 297 306 308 311 319 321 324 334 336 337 344 345 350
## 2 5 1 1 1 1 1 1 1 1 1 1 2 1 1
## 351 352 354 360 362 364 373 374 375 377 380 382 387 391 393
## 1 1 2 1 1 1 1 3 1 1 1 1 1 2 1
## 396 398 400 402 404 411 417 419 420 432 435 438 441 442 448
## 1 1 2 1 1 1 1 1 1 1 3 1 1 1 1
## 449 450 453 456 465 466 468 469 472 474 479 480 483 486 488
## 1 1 1 1 2 1 2 2 1 1 1 2 3 1 1
## 491 492 495 497 499 500 506 507 512 522 529 530 531 532 539
## 1 2 2 1 1 1 1 1 1 1 1 1 1 1 3
## 543 544 546 547 551 555 557 580 590 596 600 604 606 608 612
## 1 1 1 1 2 1 1 1 2 2 1 1 1 1 1
## 613 619 620 622 624 627 630 634 645 661 668 670 679 682 684
## 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1
## 688 690 691 692 694 712 713 722 723 748 750 755 761 764 768
## 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1
## 774 791 799 811 820 823 826 829 831 841 842 850 852 859 869
## 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1
## 872 873 875 884 891 904 906 912 915 955 956 972 981 982 1020
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1029 1031 1037 1039 1057 1061 1063 1073 1080 1083 1085 1120 1127 1164 1393
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1474 1526
## 1 1
all$BsmtFinSF2[c(2121,2189)]<-0
table(all$BsmtUnfSF)
##
## 0 14 15 17 20 22 23 25 26 27 28 29 30 32 33
## 241 1 1 1 1 1 2 3 1 1 1 1 6 2 1
## 34 35 36 38 39 40 42 45 46 48 50 52 53 54 55
## 1 2 5 1 1 1 1 1 3 2 1 2 1 2 3
## 56 57 58 60 61 63 64 70 72 74 75 76 77 78 79
## 2 1 1 2 1 1 2 2 4 1 4 4 2 3 1
## 80 81 82 83 84 86 88 89 90 91 92 93 94 95 96
## 7 1 1 3 2 1 6 3 5 1 4 3 2 2 5
## 98 99 100 102 103 104 105 106 107 108 109 110 111 112 113
## 3 3 11 3 2 2 3 1 2 8 1 2 2 3 2
## 114 115 117 118 119 120 121 122 123 124 125 126 127 128 129
## 3 5 2 1 1 7 3 3 1 2 6 3 1 1 1
## 130 131 132 133 134 135 136 137 138 140 141 142 143 144 147
## 5 2 6 3 2 4 2 2 5 5 3 3 4 3 1
## 148 149 150 151 153 154 155 156 158 160 161 162 163 164 165
## 3 1 6 2 3 1 1 2 2 6 3 8 6 1 2
## 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 5 3 6 2 2 3 5 4 3 4 4 2 1 3 5
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
## 1 2 1 3 1 9 2 4 3 6 3 5 3 1 6
## 196 197 198 200 201 203 204 206 207 208 210 212 213 214 215
## 2 4 2 6 4 3 4 3 3 4 6 5 3 1 2
## 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230
## 11 2 4 3 4 2 3 3 6 3 5 1 5 3 3
## 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245
## 1 4 1 3 4 1 2 3 1 6 1 3 1 3 5
## 246 247 248 249 250 251 252 253 254 255 256 257 258 260 261
## 1 3 5 1 5 2 7 4 4 2 2 1 1 1 5
## 262 263 264 265 266 268 269 270 271 272 273 274 275 276 277
## 2 2 9 3 1 4 1 10 1 1 2 1 1 6 2
## 278 279 280 281 282 284 285 286 288 289 290 291 292 293 294
## 6 3 9 2 4 5 3 4 4 2 2 2 6 2 8
## 295 296 297 298 299 300 301 303 304 305 306 307 308 309 310
## 3 1 4 2 3 10 3 1 3 1 5 1 4 1 3
## 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325
## 2 6 1 1 1 6 3 4 7 3 3 7 2 7 3
## 326 327 328 329 330 331 332 333 334 335 336 338 339 340 341
## 6 1 3 3 2 2 2 3 1 1 7 2 2 5 3
## 342 343 344 345 346 348 349 350 352 354 355 356 357 358 359
## 6 3 2 2 4 8 1 5 2 4 2 7 3 1 2
## 360 361 362 363 364 365 366 367 369 370 371 372 374 375 378
## 5 1 1 4 2 2 2 3 2 3 3 3 2 2 4
## 379 380 381 382 383 384 385 386 387 388 389 390 392 393 394
## 3 4 4 1 2 19 1 4 1 1 1 6 7 1 1
## 395 396 397 398 399 400 402 403 404 405 406 407 408 409 410
## 2 9 3 3 4 4 4 3 5 6 3 1 6 2 6
## 411 412 413 414 415 416 417 418 420 422 423 424 425 426 427
## 3 2 3 2 3 6 1 1 5 5 2 3 3 5 3
## 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442
## 1 2 3 3 6 1 5 1 2 2 2 1 8 4 1
## 443 444 445 447 448 449 450 451 452 453 454 455 456 457 458
## 2 2 1 1 7 2 4 1 1 1 1 4 6 3 1
## 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473
## 3 6 3 4 1 1 1 4 3 2 1 3 1 1 1
## 474 475 476 477 479 480 481 482 483 484 485 486 487 488 490
## 5 2 5 1 2 8 2 4 3 5 2 3 2 1 6
## 491 494 495 496 497 498 499 500 501 502 503 504 505 506 507
## 4 2 2 4 2 3 2 2 3 1 4 5 1 3 2
## 508 510 512 513 515 516 517 519 520 521 522 523 524 525 526
## 2 3 1 3 1 4 1 1 4 1 4 1 2 7 2
## 527 528 529 530 532 533 534 535 536 537 538 539 540 541 542
## 1 2 1 2 1 4 2 1 6 1 3 1 4 1 1
## 543 544 545 546 547 548 549 550 551 552 554 555 556 557 559
## 2 4 3 8 1 3 2 3 1 4 4 1 5 1 1
## 560 561 563 564 565 567 568 569 570 571 572 574 575 576 577
## 4 1 1 4 1 2 1 3 6 1 11 1 2 6 2
## 578 579 580 581 583 584 585 586 587 588 589 590 591 592 593
## 1 1 1 1 1 1 6 2 1 4 1 4 1 2 1
## 594 595 596 598 599 600 602 604 605 606 608 610 611 612 613
## 4 3 5 5 3 12 3 3 2 1 2 2 4 3 1
## 615 616 618 619 620 621 622 624 625 626 628 630 631 632 634
## 1 4 2 1 1 1 2 10 6 2 4 7 1 2 2
## 635 636 637 638 639 640 641 643 644 645 646 647 648 649 650
## 2 2 2 5 2 2 5 1 2 1 1 1 1 2 2
## 651 652 655 656 657 658 659 660 661 662 663 664 666 667 668
## 1 2 1 3 1 2 1 6 3 4 2 2 3 1 1
## 671 672 673 674 675 676 677 678 679 680 683 684 686 689 690
## 2 13 2 1 2 6 2 3 1 2 1 6 5 4 1
## 691 692 693 697 698 700 701 702 704 705 706 707 708 709 710
## 1 2 1 4 6 3 1 3 2 1 2 2 1 1 4
## 712 713 715 716 717 718 720 721 722 723 724 725 726 727 728
## 3 2 2 2 2 2 4 1 2 3 2 4 1 1 14
## 729 730 731 732 733 734 735 736 738 739 740 741 742 744 745
## 1 1 1 3 1 1 1 4 7 2 2 4 4 4 1
## 746 747 748 749 750 752 755 756 758 760 761 762 763 764 765
## 2 5 2 1 2 3 2 4 1 1 1 2 2 3 2
## 767 768 769 770 771 772 773 774 776 777 778 779 780 781 782
## 1 9 2 2 1 1 1 1 2 1 3 1 8 2 2
## 783 784 785 788 789 791 792 793 794 795 796 797 798 799 801
## 2 7 5 2 2 1 2 2 2 1 5 2 3 1 3
## 802 803 804 806 808 809 810 811 812 813 814 815 816 817 818
## 1 1 1 2 4 1 3 3 1 2 1 4 11 2 2
## 819 821 823 825 827 828 830 831 832 833 834 835 836 840 841
## 1 2 1 2 1 1 1 3 8 2 1 2 1 8 1
## 844 845 846 847 848 849 850 851 852 856 858 859 860 861 862
## 3 1 4 7 2 4 3 3 1 1 2 1 2 2 1
## 863 864 866 868 869 871 872 873 874 876 877 878 879 880 882
## 1 4 1 3 1 1 2 1 1 2 1 2 2 3 1
## 884 888 889 891 892 893 894 896 897 898 899 901 903 905 906
## 6 1 1 2 2 1 5 4 1 1 1 4 1 1 1
## 907 908 910 912 914 916 917 918 920 924 925 926 927 928 929
## 1 1 1 6 1 3 1 3 1 2 4 1 2 1 1
## 930 931 932 935 936 938 939 940 941 944 945 948 949 951 952
## 2 1 3 3 6 4 3 1 3 1 2 1 1 1 5
## 953 957 958 960 961 963 964 966 967 968 969 970 971 972 974
## 2 1 1 5 1 1 1 1 3 1 2 5 2 2 1
## 975 976 977 978 982 983 988 989 992 993 994 995 996 998 999
## 2 1 1 1 2 1 1 1 1 1 2 1 1 1 1
## 1005 1007 1008 1010 1012 1013 1017 1018 1020 1022 1026 1028 1030 1032 1035
## 1 1 5 1 2 1 3 2 2 1 2 1 1 1 2
## 1040 1041 1042 1043 1045 1046 1048 1050 1052 1053 1054 1055 1057 1058 1063
## 4 1 1 2 1 2 2 1 1 1 2 2 1 1 1
## 1064 1065 1066 1068 1072 1073 1074 1075 1077 1078 1079 1081 1082 1084 1085
## 3 2 1 1 1 1 2 1 2 1 1 1 2 1 2
## 1087 1088 1090 1092 1093 1094 1095 1097 1098 1099 1100 1104 1105 1107 1108
## 1 2 2 2 2 1 3 1 1 1 4 1 1 1 1
## 1114 1115 1116 1117 1118 1120 1121 1122 1125 1126 1128 1129 1131 1139 1140
## 3 1 1 2 1 1 1 1 1 1 2 3 1 2 3
## 1141 1143 1144 1145 1146 1150 1152 1153 1158 1162 1163 1164 1168 1173 1176
## 2 3 1 1 1 1 2 1 3 1 1 1 1 1 1
## 1179 1180 1181 1184 1191 1194 1195 1198 1200 1203 1204 1209 1211 1212 1214
## 1 1 2 1 2 1 2 2 1 1 2 1 1 1 1
## 1216 1217 1218 1221 1226 1228 1232 1234 1237 1240 1242 1246 1248 1249 1250
## 1 1 2 1 2 2 2 2 2 2 1 1 1 1 1
## 1251 1252 1254 1257 1258 1262 1264 1266 1270 1272 1273 1276 1280 1284 1286
## 1 1 1 2 3 1 1 2 1 1 1 2 1 1 2
## 1288 1290 1293 1296 1298 1300 1302 1304 1307 1308 1313 1314 1316 1318 1319
## 2 2 1 1 1 1 2 1 1 1 2 1 2 1 1
## 1323 1324 1326 1327 1328 1330 1332 1335 1337 1339 1341 1342 1344 1346 1347
## 1 1 1 1 1 3 1 2 1 2 1 2 1 3 1
## 1348 1349 1351 1352 1357 1362 1367 1368 1369 1372 1373 1374 1375 1376 1380
## 1 1 1 1 1 2 1 2 1 1 1 1 2 1 1
## 1390 1392 1393 1400 1402 1404 1405 1406 1410 1411 1413 1417 1420 1421 1422
## 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1
## 1424 1428 1430 1434 1436 1439 1442 1444 1450 1451 1453 1459 1461 1466 1468
## 2 1 1 1 2 1 1 1 2 2 1 1 1 1 4
## 1470 1473 1474 1480 1482 1486 1488 1489 1491 1494 1495 1496 1497 1498 1502
## 1 1 1 1 1 1 1 2 2 1 1 3 1 2 1
## 1503 1504 1508 1510 1515 1519 1526 1527 1528 1530 1541 1544 1550 1556 1558
## 1 1 1 1 1 1 1 1 3 4 1 1 2 1 1
## 1559 1560 1566 1568 1571 1573 1574 1580 1582 1584 1588 1590 1594 1595 1598
## 1 4 1 2 1 1 1 2 1 3 2 1 2 2 1
## 1602 1603 1604 1614 1615 1616 1618 1619 1622 1625 1626 1629 1630 1632 1638
## 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1
## 1640 1643 1649 1652 1656 1660 1664 1670 1678 1680 1685 1686 1689 1694 1696
## 1 2 1 1 2 1 1 1 1 2 1 1 2 3 1
## 1704 1706 1709 1710 1721 1726 1728 1734 1736 1752 1753 1765 1768 1774 1777
## 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1
## 1794 1795 1800 1802 1824 1836 1851 1866 1869 1905 1907 1921 1926 1935 1958
## 1 1 3 1 2 2 1 1 1 1 1 1 1 1 1
## 1967 1969 2002 2042 2046 2062 2121 2140 2153 2336
## 1 1 1 1 1 1 1 1 1 1
all$BsmtUnfSF[c(2121,2189)]<-0
NaNum(all)
## SalePrice MasVnrType MasVnrArea MSZoning Utilities Functional
## 1459 24 23 4 2 2
## Exterior1st Exterior2nd Electrical KitchenQual SaleType
## 1 1 1 1 1
Imputing missing data: masonry
NaNum(all)
## SalePrice MasVnrType MasVnrArea MSZoning Utilities Functional
## 1459 24 23 4 2 2
## Exterior1st Exterior2nd Electrical KitchenQual SaleType
## 1 1 1 1 1
all$MasVnrArea[which(is.na(all$MasVnrType)&!is.na(all$MasVnrArea))]<-'None'
table(all$MasVnrType)
##
## BrkCmn BrkFace None Stone
## 25 879 1742 249
table(all$MasVnrArea)
##
## 0 1 100 101 102 1031 104 1047 105 1050 106 108 109 1095 11
## 1738 3 5 3 2 1 4 1 2 1 7 11 1 1 1
## 110 1110 1115 112 1129 113 114 115 1159 116 117 1170 118 119 120
## 3 1 1 6 1 3 2 3 1 3 2 1 1 2 15
## 121 122 1224 123 124 125 126 127 128 1290 130 132 134 135 136
## 1 3 2 3 1 3 4 1 9 1 6 8 2 3 5
## 137 1378 138 14 140 141 142 143 144 145 146 147 148 149 150
## 1 1 2 4 7 1 2 6 11 6 2 2 5 4 5
## 151 153 154 156 157 158 16 160 1600 161 162 163 164 165 166
## 1 3 1 3 3 3 11 5 1 3 5 2 7 3 4
## 167 168 169 170 171 172 174 175 176 177 178 179 18 180 182
## 1 5 3 8 2 5 7 1 13 1 8 1 3 12 5
## 183 184 186 187 188 189 190 192 194 196 197 198 199 20 200
## 4 3 7 1 3 3 3 4 5 9 1 5 1 4 13
## 202 203 204 205 206 207 208 209 210 212 214 215 216 217 218
## 2 7 2 3 5 1 3 2 9 4 1 3 12 1 3
## 219 22 220 221 222 223 224 225 226 227 228 229 23 230 232
## 1 2 4 1 1 1 1 1 4 2 2 1 4 2 6
## 233 234 235 236 237 238 24 240 242 243 244 245 246 247 248
## 2 2 1 3 1 4 2 7 4 2 2 2 6 1 4
## 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
## 4 1 7 1 2 1 8 1 2 2 7 2 1 1 3
## 265 266 268 27 270 272 274 275 276 278 279 28 280 281 283
## 2 2 5 1 7 5 1 3 1 2 1 2 4 2 1
## 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298
## 3 3 2 1 6 3 3 1 2 1 2 3 2 1 3
## 299 3 30 300 302 304 305 306 308 309 31 310 312 315 318
## 1 1 4 7 8 3 3 6 1 2 1 3 3 1 2
## 32 320 322 323 324 327 328 332 333 335 336 337 338 34 340
## 4 7 1 1 1 1 2 1 1 2 4 1 2 1 10
## 342 344 348 350 351 352 353 355 356 359 36 360 361 362 364
## 2 2 1 3 2 2 1 1 2 2 2 7 1 2 2
## 365 366 368 370 371 372 375 376 378 379 38 380 381 382 383
## 2 2 2 1 1 1 1 1 2 1 2 2 1 1 2
## 385 387 388 39 391 394 396 397 399 40 400 402 405 406 408
## 1 1 1 1 1 1 1 1 1 8 1 2 1 1 1
## 41 410 412 415 418 42 420 422 423 424 425 426 428 430 432
## 3 2 1 1 1 3 7 2 3 2 3 1 1 2 2
## 434 435 436 438 44 440 442 443 444 448 45 450 451 452 456
## 1 1 1 1 7 1 3 1 1 1 3 4 1 1 7
## 459 46 464 466 468 47 470 472 473 479 48 480 481 491 492
## 1 1 1 3 2 1 1 3 3 1 1 4 1 1 2
## 495 50 500 501 502 504 506 509 51 510 513 514 515 518 519
## 1 7 2 1 1 6 2 1 3 2 5 1 1 1 1
## 52 522 525 526 528 53 530 532 54 541 549 550 554 56 562
## 3 1 2 1 1 2 1 1 4 1 1 1 3 2 1
## 564 567 568 57 571 572 573 576 579 58 584 594 60 600 603
## 1 2 2 1 1 1 1 1 1 2 1 1 7 3 1
## 604 615 616 62 621 63 630 632 634 64 640 647 65 650 651
## 1 1 1 1 2 1 1 2 1 1 1 1 2 2 1
## 652 653 657 66 660 662 664 668 67 673 674 68 680 69 692
## 1 1 1 2 2 1 1 1 2 1 2 5 1 1 1
## 70 705 710 714 72 724 726 730 731 734 738 74 748 75 754
## 4 1 1 1 11 1 1 1 1 1 1 4 1 2 1
## 76 760 762 766 768 771 772 788 796 80 81 816 82 84 85
## 7 1 1 1 1 1 1 1 1 9 1 1 5 7 4
## 86 860 87 870 877 88 886 89 894 90 902 91 92 921 922
## 3 1 1 1 1 5 1 2 1 6 1 1 2 1 1
## 94 945 95 96 97 970 975 98 99 None
## 4 1 3 4 1 1 1 5 4 1
all$MasVnrArea[which(is.na(all$MasVnrArea))]<-0
all$MasVnrType[which(is.na(all$MasVnrType))]<-'None'
table(all$MasVnrType)
##
## BrkCmn BrkFace None Stone
## 25 879 1766 249
detach("package:plyr", unload=TRUE)
all[!is.na(SalePrice), ]%>%group_by(MasVnrType)%>%
summarise(median=median(SalePrice),count=n())%>%arrange(median)
#由此可知,none和BrkCmn對價錢沒什麼影響力,BrkFace和Stone則影響著價錢
#有order, 化作integer
Masonry<-c('BrkCmn'=0,'None'=0,'BrkFace'=1,'Stone'=2)
library(plyr)
all$MasVnrType<-as.integer(plyr::revalue(all$MasVnrType,Masonry))
table(all$MSZoning)
##
## C (all) FV RH RL RM
## 25 139 26 2265 460
sum(is.na(all$MSZoning))
## [1] 4
all$MSZoning[is.na(all$MSZoning)]<-names(sort(-table(all$MSZoning),increasing=TRUE))[1]
all$MSZoning<-as.factor(all$MSZoning)
sum(table(all$MSZoning))
## [1] 2919
NaNum(all)
## SalePrice Utilities Functional Exterior1st Exterior2nd Electrical
## 1459 2 2 1 1 1
## KitchenQual SaleType
## 1 1
table(all$KitchenQual)
##
## Ex Fa Gd TA
## 205 70 1151 1492
all$KitchenQual[is.na(all$KitchenQual)]<-names(sort(-table(all$KitchenQual),increasing=TRUE))[1]
all$KitchenQual<-as.integer(plyr::revalue(all$KitchenQual,Qualities))
####
class(all$KitchenAbvGr)
## [1] "numeric"
NaNum(all)
## SalePrice Utilities Functional Exterior1st Exterior2nd Electrical
## 1459 2 2 1 1 1
## SaleType
## 1
table(Utilities)
## Utilities
## AllPub NoSeWa
## 2916 1
all$Utilities[is.na(all$Utilities)]<-names(sort(-table(all$Utilities),increasing=TRUE))[1]
#However, the table below shows that only one house does not have all public utilities. This house is in the train set. Therefore, imputing ‘AllPub’ for the NAs means that all houses in the test set will have ‘AllPub’. This makes the variable useless for prediction. Consequently, I will get rid of it.
kable(all[is.na(all$Utilities)|all$Utilities=='NoSeWa',1:9])
| 20 |
RL |
82 |
14375 |
Pave |
None |
2 |
Lvl |
NoSeWa |
all$Utilities<- NULL
##當整個column的value都一樣時,這個column就沒用了
Imputing missing data: HomeFunctionality
NaNum(all)
## SalePrice Functional Exterior1st Exterior2nd Electrical SaleType
## 1459 2 1 1 1 1
table(all$Functional)
##
## Maj1 Maj2 Min1 Min2 Mod Sev Typ
## 19 9 65 70 35 2 2717
all$Functional[is.na(all$Functional)]<-names(sort(-table(all$Functional),increasing=TRUE))[1]
all$Functional<-as.integer(plyr::revalue(all$Functional, c('Sal'=0,'Sev'=1, 'Maj2'=2, 'Maj1'=3, 'Mod'=4,'Min2'=5, 'Min1'=6,'Typ'=7)))
NaNum(all)
## SalePrice Exterior1st Exterior2nd Electrical SaleType
## 1459 1 1 1 1
table(all$ExterQual)
##
## Ex Fa Gd TA
## 107 35 979 1798
all$ExterQual<-as.integer(plyr::revalue(all$ExterQual, Qualities))
###
table(all$ExterCond)
##
## Ex Fa Gd Po TA
## 12 67 299 3 2538
all$ExterCond<-as.integer(plyr::revalue(all$ExterCond, Qualities))
###
table(all$Exterior2nd)
##
## AsbShng AsphShn Brk Cmn BrkFace CBlock CmentBd HdBoard ImStucc MetalSd
## 38 4 22 47 3 126 406 15 447
## Other Plywood Stone Stucco VinylSd Wd Sdng Wd Shng
## 1 270 6 47 1014 391 81
all$Exterior2nd[is.na(all$Exterior2nd)]<-names(sort(-table(all$Exterior2nd),increasing=TRUE))[1]
###
table(all$Exterior1st)
##
## AsbShng AsphShn BrkComm BrkFace CBlock CemntBd HdBoard ImStucc MetalSd
## 44 2 6 87 2 126 442 1 450
## Plywood Stone Stucco VinylSd Wd Sdng WdShing
## 221 2 43 1025 411 56
all$Exterior1st[is.na(all$Exterior1st)]<-names(sort(-table(all$Exterior1st),increasing=TRUE))[1]
NaNum(all)
## SalePrice Electrical SaleType
## 1459 1 1
table(all$Electrical)
##
## FuseA FuseF FuseP Mix SBrkr
## 188 50 8 1 2671
all$Electrical[is.na(all$Electrical)]<-names(sort(-table(all$Electrical),incresing=TRUE))[1]
NaNum(all)
## SalePrice SaleType
## 1459 1
table(all$SaleType)
##
## COD Con ConLD ConLI ConLw CWD New Oth WD
## 87 5 26 9 8 12 239 7 2525
all$SaleType[is.na(all$SaleType)]<-names(sort(-table(all$SaleType),increasing=TRUE))[1]
###
table(all$SaleCondition)
##
## Abnorml AdjLand Alloca Family Normal Partial
## 190 12 24 46 2402 245
detach("package:plyr", unload=TRUE)
all[!is.na(all$SalePrice),]%>%group_by(SaleCondition)%>%
summarise(median=median(SalePrice),count=n())%>%arrange(median)
all$SaleType<-as.factor(all$SaleType)
#思考一下要不要換成ordinal
Label encoding/factorizing the remaining chactogrical variables
##還有幾個variable是charactor
Charcol<- names(all[,sapply(all,is.character)])
##寫成function
ChaNum<-function(df){
names(df[,sapply(df,is.character)])
}
cat('There are', length(Charcol), 'remaining catagorical variables in the data')
## There are 21 remaining catagorical variables in the data
Charcol
## [1] "Street" "LandContour" "LotConfig" "LandSlope"
## [5] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [9] "HouseStyle" "RoofStyle" "RoofMatl" "Exterior1st"
## [13] "Exterior2nd" "MasVnrArea" "Foundation" "Heating"
## [17] "HeatingQC" "CentralAir" "Electrical" "PavedDrive"
## [21] "SaleCondition"
Factorizing character: Foundation
all$Foundation<-as.factor(all$Foundation)
table(all$Foundation)
##
## BrkTil CBlock PConc Slab Stone Wood
## 311 1235 1308 49 11 5
sum(table(all$Foundation))
## [1] 2919
Factorizing character: Heating and airco
all$Heating<-as.factor(all$Heating)
table(all$Heating)
##
## Floor GasA GasW Grav OthW Wall
## 1 2874 27 9 2 6
sum(table(all$Heating))
## [1] 2919
###
all$HeatingQC<-as.integer(plyr::revalue(all$HeatingQC,Qualities))
table(all$HeatingQC)
##
## 1 2 3 4 5
## 3 92 857 474 1493
sum(table(all$HeatingQC))
## [1] 2919
###
#可以化為布林,不要放過
all$CentralAir<-as.integer(plyr::revalue(all$CentralAir,c('N'=0,'Y'=1)))
table(all$CentralAir)
##
## 0 1
## 196 2723
sum(table(CentralAir))
## [1] 2919
Factorizing character: Roof
ChaNum(all)
## [1] "Street" "LandContour" "LotConfig" "LandSlope"
## [5] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [9] "HouseStyle" "RoofStyle" "RoofMatl" "Exterior1st"
## [13] "Exterior2nd" "MasVnrArea" "Electrical" "PavedDrive"
## [17] "SaleCondition"
all$RoofMatl<-as.factor(all$RoofMatl)
table(all$RoofMatl)
##
## ClyTile CompShg Membran Metal Roll Tar&Grv WdShake WdShngl
## 1 2876 1 1 1 23 9 7
sum(table(all$RoofMatl))
## [1] 2919
###
all$RoofStyle<-as.factor(all$RoofStyle)
table(all$RoofStyle)
##
## Flat Gable Gambrel Hip Mansard Shed
## 20 2310 22 551 11 5
length(all$RoofStyle)
## [1] 2919
Factorizing character: Land,neighborhood
ChaNum(all)
## [1] "Street" "LandContour" "LotConfig" "LandSlope"
## [5] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [9] "HouseStyle" "Exterior1st" "Exterior2nd" "MasVnrArea"
## [13] "Electrical" "PavedDrive" "SaleCondition"
##不能像下面這樣寫(一次把所有的column叫出來factorize),會把全部都變成intweger(查一下為什麼!!!)
####str(all[,c("LandContour","LandSlope","Neighborhood")])
####all[,c("LandContour","LandSlope","Neighborhood")]<-c(as.factor(all$LandContour),as.factor(all$LandSlope),as.factor(all$Neighborhood))
####str(all[,c("LandContour","LandSlope","Neighborhood")])
table(all$LandContour)
##
## Bnk HLS Low Lvl
## 117 120 60 2622
ContourType<-c('Bnk'=1,'HLS'=2,'Low'=3,'Lvl'=4)
all$LandContour<-as.integer(plyr::revalue(all$LandContour,ContourType))
sum(table(all$LandContour))
## [1] 2919
###
table(all$LandSlope)
##
## Gtl Mod Sev
## 2778 125 16
all$LandSlope<-as.integer(plyr::revalue(all$LandSlope,c('Sev'=0,'Mod'=1,'Gtl'=2)))
sum(table(all$LandSlope))
## [1] 2919
###
table(all$Neighborhood)
##
## Blmngtn Blueste BrDale BrkSide ClearCr CollgCr Crawfor Edwards Gilbert
## 28 10 30 108 44 267 103 194 165
## IDOTRR MeadowV Mitchel NAmes NoRidge NPkVill NridgHt NWAmes OldTown
## 93 37 114 443 71 23 166 131 239
## Sawyer SawyerW Somerst StoneBr SWISU Timber Veenker
## 151 125 182 51 48 72 24
all$Neighborhood<-as.factor(all$Neighborhood)
sum(table(all$Neighborhood))
## [1] 2919
class(all$Neighborhood)
## [1] "factor"
###
table(all$Condition1)
##
## Artery Feedr Norm PosA PosN RRAe RRAn RRNe RRNn
## 92 164 2511 20 39 28 50 6 9
all$Condition1<-as.factor(all$Condition1)
###
table(all$Condition2)
##
## Artery Feedr Norm PosA PosN RRAe RRAn RRNn
## 5 13 2889 4 4 1 1 2
all$Condition2<-as.factor(all$Condition2)
Factorizing character: Dwelling
ChaNum(all)
## [1] "Street" "LotConfig" "BldgType" "HouseStyle"
## [5] "Exterior1st" "Exterior2nd" "MasVnrArea" "Electrical"
## [9] "PavedDrive" "SaleCondition"
table(all$BldgType)
##
## 1Fam 2fmCon Duplex Twnhs TwnhsE
## 2425 62 109 96 227
##注意,給定絕對路徑後,x,y就不會和指定的data掛鉤,這樣是不行的:eg: ggplot(all[!is.na(SalePrice),],aes(all$BldgType, all$SalePrice))
ggplot(all[!is.na(SalePrice),],aes(BldgType, SalePrice))+
geom_bar(stat='summary', fun.y='median',fill='blue')+
scale_y_continuous(breaks=seq(0,200000,50000),labels=comma)+
geom_label(stat='count',aes(label=..count..,y=..count..))

all[!is.na(SalePrice),]%>%group_by(BldgType)%>%summarise(median=median(SalePrice), count=n())%>%arrange(median)
##no ordinality???
all$BldgType<- as.factor(all$BldgType)
class(all$BldgType)
## [1] "factor"
Factorizing character: Pavement of street and drive way
table(all$Street)
##
## Grvl Pave
## 12 2907
all$Street<-as.integer(plyr::revalue(all$Street,c('Grvl'=0,'Pave'=1)))
##
table(all$PavedDrive)
##
## N P Y
## 216 62 2641
all$PavedDrive<-as.integer(plyr::revalue(all$PavedDrive, c('N'=0,'P'=1,'Y'=2)))
At this point, all variables are complete (No NAs), and all character variables are converted into either numeric labels of into factors. However, there are 3 variables that are recorded numeric but should actually be categorical.
Changing some numeric variables into factors:Year and Month Sold
table(all$YrSold)
##
## 2006 2007 2008 2009 2010
## 619 692 622 647 339
str(all$YrSold)
## num [1:2919] 2008 2007 2008 2006 2008 ...
table(all$MoSold)
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## 122 133 232 279 394 503 446 233 158 173 142 104
str(all$MoSold)
## num [1:2919] 2 5 9 2 12 10 8 11 4 1 ...
all$MoSold<-as.factor(all$MoSold)
###
#Although possible a bit less steep than expected, the effects of the Banking crises that took place at the end of 2007 can be seen indeed. After the highest median prices in 2007, the prices gradually decreased. However, seasonality seems to play a bigger role, as you can see below.
ys <- ggplot(all[!is.na(all$SalePrice),], aes(x=as.factor(YrSold), y=SalePrice)) +
geom_bar(stat='summary', fun.y = "median", fill='blue')+
scale_y_continuous(breaks= seq(0, 800000, by=25000), labels = comma) +
geom_label(stat = "count", aes(label = ..count.., y = ..count..)) +
coord_cartesian(ylim = c(0, 200000)) +
geom_hline(yintercept=163000, linetype="dashed", color = "red") #dashed line is median SalePrice
ys<-ggplot(all[!is.na(all$SalePrice),],aes(as.factor(YrSold),SalePrice))+
geom_bar(stat='summary',fun.y='median',fill='blue')+xlab('YrSold')+
scale_y_continuous(breaks=seq(0,800000,25000),labels=comma)+
geom_label(stat='count',aes(label=..count..,y=..count..))+
coord_cartesian(ylim=c(0,200000))+
geom_hline(yintercept=163000,linetype='dashed',color='red')
#?coord_cartesian
ms<-ggplot(all[!is.na(all$SalePrice),],aes(as.factor(MoSold),SalePrice))+
geom_bar(stat="summary", fun.y='median',fill='blue')+xlab('MoSold')+
scale_y_continuous(breaks = seq(0,800000,25000),labels=comma)+
geom_label(stat='count', aes(label=..count..,y=..count..))+
coord_cartesian(ylim=c(0,200000))+
geom_hline(yintercept=163000,linetype='dashed',color='red')
##stat=summary的功能類似group by + summary, fuy.y表示要在y做什麼事,所以就是group by x軸,針對subset 做fun.y的計算
##這裡是做median,比較下面這行的結果和圖中1月的值
#median(all$SalePrice[all$MoSold==1&!is.na(all$SalePrice)])
grid.arrange(ys,ms,widths=c(1,2))

##grid.arrange把兩張圖一起陳列,width=c(1,2)表示陳列為一個row,兩個col(亦即擺在同一行裡面看,不要分兩行)
#####房價被2007年的次貸風暴影響,但看似月份影響比較大
Changing some numeric variables into factors:MSSubClass
table(all$MSSubClass)
##
## 20 30 40 45 50 60 70 75 80 85 90 120 150 160 180
## 1079 139 6 18 287 575 128 23 118 48 109 182 1 128 17
## 190
## 61
str(all$MSSubClass)
## num [1:2919] 60 20 60 70 60 50 20 60 50 190 ...
all$MSSubClass<-as.factor(all$MSSubClass)
all$MSSubClass<-plyr::revalue(all$MSSubClass,c('20'='1 story 1946+', '30'='1 story 1945-', '40'='1 story unf attic', '45'='1,5 story unf', '50'='1,5 story fin', '60'='2 story 1946+', '70'='2 story 1945-', '75'='2,5 story all ages', '80'='split/multi level', '85'='split foyer', '90'='duplex all style/age', '120'='1 story PUD 1946+', '150'='1,5 story PUD all', '160'='2 story PUD 1946+', '180'='PUD multilevel', '190'='2 family conversion'))
Visualization of important variables
# I have now finally reached the point where all character variables have been converted into categorical factors or have been label encoded into numbers. In addition, 3 numeric variables have been converted into factors, and I deleted one variable (Utilities). As you can see below, the number of numerical variables is now 56 (including the response variable), and the remaining 23 variables are categorical.
numericVars<-which(sapply(all,is.numeric))
factorVars<-which(sapply(all,is.factor))
cat('There are', length(numericVars), 'numeric variables, and', length(factorVars), 'categoric variables')
## There are 56 numeric variables, and 16 categoric variables
Correlations again
all_numVar<-all[,numericVars]
cor_numVar<-cor(all_numVar,use='pairwise.complete.obs')
cor_sorted<-as.matrix(sort(cor_numVar[,"SalePrice"],decreasing = TRUE))
CorHigh<-names(which(apply(cor_sorted,1,function(x) abs(x)>0.5)))
cor_numVar<-cor_numVar[CorHigh,CorHigh]
corrplot.mixed(cor_numVar,tl.col='black',tl.pos='lt',tl.cex=0.7,cl.cex=0.7,number.cex=0.7)

#?corrplot.mixed
Finding variable importance with a quick Random Forest
# Although the correlations are giving a good overview of the most important numeric variables and multicolinerity among those variables, I wanted to get an overview of the most important variables including the categorical variables before moving on to visualization.
# I tried to get the relative importance of variables with a quick linear regression model with the calc.relimp function of package , and also tried the boruta function of package boruta which separates the variables into groups that are important or not. However, these method took a long time. As I only want to get an indication of the variable importance, I eventually decided to keep it simple and just use a quick and dirty Random Forest model with only 100 trees. This also does the job for me, and does not take very long as I can specify a (relatively) small number of trees.
nrow(all)
## [1] 2919
###facotrize all character,不然不能random forest
ChaNum(all)
## [1] "LotConfig" "HouseStyle" "Exterior1st" "Exterior2nd"
## [5] "MasVnrArea" "Electrical" "SaleCondition"
all$MSZoning<-as.factor(all$MSZoning)
all$LotConfig<-as.factor(all$LotConfig)
all$HouseStyle<-as.factor(all$HouseStyle)
all$Exterior1st<-as.factor(all$Exterior1st)
all$Exterior2nd<-as.factor(all$Exterior2nd)
all$Electrical<-as.factor(all$Electrical)
all$SaleCondition<-as.factor(all$SaleCondition)
###下面這邊,要查
set.seed(2018)
quick_RF <- randomForest(x=all[1:1460,-79], y=all$SalePrice[1:1460], ntree=100,importance=TRUE)
imp_RF<-importance(quick_RF)
imp_DF<-data.frame(Variables=row.names(imp_RF),MSE=imp_RF[,1])
imp_DF<-imp_DF[order(imp_DF$MSE,decreasing=TRUE),]
ggplot(imp_DF[1:20,],aes(reorder(Variables,MSE),MSE,fill=MSE))+
geom_bar(stat='identity')+labs(x='Variables',y='% increase MSE if Variable is randomly permuted')+
coord_flip()+theme(legend.position='none')

# Only 3 of those most important variables are categorical according to RF; Neighborhood, MSSubClass, and GarageType.
Above Ground Living Area, and other surface related variables (in square feet)
# As I have already visualized the relation between the Above Ground Living Area and SalePrice in my initial explorations, I will now just display the distribution itself. As there are more ‘square feet’ surface measurements in the Top 20, I am taking the opportunity to bundle them in this section. Note: GarageArea is taken care of in the Garage variables section.
# I am also adding ‘Total Rooms Above Ground’ (TotRmsAbvGrd) as this variable is highly correlated with the Above Ground Living Area(0.81).
######???想一下為什麼這樣選和畫
s1<-ggplot(all,aes(GrLivArea))+geom_density()+labs(x='Square feet living area')
s2<-ggplot(all, aes(as.factor(TotRmsAbvGrd)))+geom_histogram(stat='count')+labs(x='Rooms above Ground')
s3<-ggplot(all,aes(`1stFlrSF`))+geom_density()+labs(x='Square feet first floor')
s4<-ggplot(all,aes(`2ndFlrSF`))+geom_density()+labs(x='Square feet second floor')
s5<-ggplot(all,aes(TotalBsmtSF))+geom_density()+labs(x='Square feet basement')
s6<-ggplot(all[all$LotArea<100000,],aes(LotArea))+geom_density()+labs(x='Square feet lot')
s7<-ggplot(all,aes(LotFrontage))+geom_density()+labs(x='Linear feet lot frontage')
s8<-ggplot(all,aes(LowQualFinSF))+geom_histogram()+labs(x='Low quality square feet 1st & 2nd')
######???查!!!!!
layout<-matrix(c(1,2,5,3,4,8,6,7),4,2,byrow=TRUE)
Rmisc::multiplot(s1,s2,s3,s4,s5,s6,s7,s8,layout=layout)

#####
# I will investigate several of these variables for outliers later on. For the lot visualization, I have already taken out the lots above 100,000 square feet (4 houses).
# GrLivArea seemed to be just the total of square feet 1st and 2nd floor. However, in a later version, I discovered that there is also a variable called: LowQualFinSF: Low quality finished square feet (all floors). As you can see above (Low quality square feet 1st and 2nd) almost all houses have none of this (only 40 houses do have some). It turns out that these square feet are actually included in the GrLivArea. The correlation between those 3 variables and GrLivArea is exactely 1.
###利用cor來確定var的主從關係
cor(all$GrLivArea, (all$`1stFlrSF`+all$`2ndFlrSF`+all$LowQualFinSF))
## [1] 1
head(all[all$LowQualFinSF>0, c('GrLivArea', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF')])
The most important categorical variable; Neighborhood
median(all$SalePrice[!is.na(all$SalePrice)])
## [1] 163000
n1<-ggplot(all[!is.na(all$SalePrice),], aes(Neighborhood,SalePrice))+
geom_bar(stat='summary',fun.y='median',fill='blue')+
theme(axis.text.x=element_text(angle=45,hjust = 1))+
scale_y_continuous(breaks=seq(0,800000,50000),labels=comma)+
geom_label(stat='count',aes(label=..count..,y=..count..),size=3)+
geom_hline(yintercept=163000,linetype='dashed',color='red') #dashed line is median SaleP
n2<-ggplot(all,aes(Neighborhood))+
geom_histogram(stat='count')+
geom_label(stat='count',aes(label=..count..,y=..count..),size=3)+
theme(axis.text.x=element_text(angle=45,hjust=1))
grid.arrange(n1,n2)

Overall Quality, and other Quality variables
q1<-ggplot(all,aes(as.factor(OverallQual)))+
geom_histogram(stat='count')
q2<-ggplot(all,aes(as.factor(ExterQual)))+
geom_histogram(stat='count')
q3<-ggplot(all,aes(as.factor(BsmtQual)))+
geom_histogram(stat='count')
q4<-ggplot(all,aes(as.factor(KitchenQual)))+
geom_histogram(stat='count')
q5<-ggplot(all,aes(as.factor(GarageQual)))+
geom_histogram(stat = 'count')
q6<-ggplot(all,aes(as.factor(FireplaceQu)))+
geom_histogram(stat='count')
q7<-ggplot(all,aes(as.factor(PoolQC)))+
geom_histogram(stat='count')
layout <- matrix(c(1,2,8,3,4,8,5,6,7),3,3,byrow=TRUE)
Rmisc::multiplot(q1,q2,q3,q4,q5,q6,q7,layout = layout)

##layout要給matrix,顯示方式就是照matrix的排列來排,這裏layout放8在中間是因為放一個空的圖,因為有些row要放兩個,有些row要放三個
#Overall Quality is very important, and also more granular than the other variables. External Quality is also improtant, but has a high correlation with Overall Quality (0.73). Kitchen Quality also seems one to keep, as all houses have a kitchen and there is a variance with some substance. Garage Quality does not seem to distinguish much, as the majority of garages have Q3. Fireplace Quality is in the list of high correlations, and in the important variables list. The PoolQC is just very sparse (the 13 pools cannot even be seen on this scale). I will look at creating a ‘has pool’ variable later on.
The second most important categorical variable; MSSubClass
# The first visualization shows the median SalePrice by MSSubClass. The frequency (number of houses) of each MSSubClass in the train set is shown in the labels.
# The histrogram shows the frequencies across all data. Most houses are relatively new, and have one or two stories.
ms1<-ggplot(all[!is.na(all$SalePrice),],aes(MSSubClass,SalePrice))+
geom_bar(stat='summary',fun.y='median',fill='blue')+
theme(axis.text.x=element_text(angle=45,hjust=1))+
scale_y_continuous(breaks=seq(0,800000,50000),labels=comma)+
geom_label(stat='count',aes(label=..count..,y=..count..),size=3)+
geom_hline(yintercept=163000,linetype='dashed',color='red')#dashed line is median SalePrice
ms2<-ggplot(all,aes(MSSubClass))+
geom_histogram(stat='count')+
geom_label(stat='count',aes(label=..count..,y=..count..),size=3)+
theme(axis.text.x=element_text(angle = 45,hjust=1))
grid.arrange(ms1,ms2)

Garage variables
# Several Garage variables have a high correlation with SalePrice, and are also in the top-20 list of the quick random forest. However, there is multicolinearity among them and I think that 7 garage variables is too many anyway. I feel that something like 3 variables should be sufficient (possibly GarageCars, GarageType, and a Quality measurement), but before I do any selection I am visualizing all of them in this section.
#correct error
all$GarageType[2593]<-2007
#this must have been a typo. GarageYrBlt=2207, YearBuilt=2006, YearRemodAdd=2007.
g1<-ggplot(all[all$GarageCars!=0,],aes(GarageYrBlt))+geom_histogram()
g2<-ggplot(all,aes(as.factor(GarageCars)))+geom_histogram(stat='count')
g3<-ggplot(all,aes(GarageArea))+geom_density()
g4<-ggplot(all,aes(as.factor(GarageCond)))+geom_histogram(stat='count')
g5<-ggplot(all,aes(GarageType))+geom_histogram(stat='count')
g6<-ggplot(all,aes(as.factor(GarageQual)))+geom_histogram(stat='count')
g7<-ggplot(all,aes(as.factor(GarageFinish)))+geom_histogram(stat='count')
layout<- matrix(c(1,5,5,2,3,8,6,4,7),3,3,byrow=TRUE)
Rmisc::multiplot(g1,g2,g3,g4,g5,g6,g7,layout=layout)

##連續兩格放同一個圖,會變成一張圖佔兩格空間
# Similar the garage variables, multiple basement variables are important in the correlations matrix and the Top 20 RF predictors list. However, 11 basement variables seems an overkill. Before I decide what I am going to do with them, I am visualizing 8 of them below. The 2 “Bathroom” variables are dealt with in Feature Engineering (section 7.1), and the “Basement square feet” is already discussed in section 6.2.1.
b1 <- ggplot(all,aes(BsmtFinSF1))+geom_histogram()+labs(x='Type 1 finished square feet')
b2 <- ggplot(all,aes(BsmtFinSF2))+geom_histogram()+labs(x='Type 2 finished square feet')
b3 <- ggplot(all,aes(BsmtUnfSF))+geom_histogram()+labs(x='Unfinished square feet')
b4 <- ggplot(all,aes(as.factor(BsmtFinType1)))+geom_histogram(stat='count')+labs(x='Rating of Type 1 finished area')
b5 <- ggplot(all,aes(as.factor(BsmtFinType2)))+geom_histogram(stat='count')+labs(x='Rating of Type 2 finished area')
b6 <- ggplot(all,aes(as.factor(BsmtQual)))+geom_histogram(stat='count')+labs(x='Height of the basement')
b7 <- ggplot(all,aes(as.factor(BsmtCond)))+geom_histogram(stat='count')+labs(x='Rating of general condition')
b8 <- ggplot(all,aes(as.factor(BsmtExposure)))+ geom_histogram(stat='count')+labs(x='Walkout or garden level walls')
layout<-matrix(c(1,2,3,4,5,9,6,7,8),3,3,byrow=TRUE)
Rmisc::multiplot(b1,b2,b3,b4,b5,b6,b7,b8,layout=layout)

##So it seemed as if the Total Basement Surface in square feet (TotalBsmtSF) is further broken down into finished areas (2 if more than one type of finish), and unfinished area. I did a check between the correlation of total of those 3 variables, and TotalBsmtSF. The correlation is exactely 1, so that’s a good thing (no errors or small discrepancies)!
#Basement Quality is a confusing variable name, as it turns out that it specifically rates the Height of the basement.
all[,c('TotalBsmtSF',"BsmtFinSF1","BsmtFinSF2","BsmtUnfSF")]
cor(all$TotalBsmtSF,all$BsmtFinSF1+all$BsmtFinSF2+all$BsmtUnfSF)
## [1] 1
##TotalBsmtSF=BsmtFinSF1+BsmtFinSF2+BsmtUnfSF
Feature engineering
Total number of Bathrooms
# There are 4 bathroom variables. Individually, these variables are not very important. However, I assume that I if I add them up into one predictor, this predictor is likely to become a strong one.
# “A half-bath, also known as a powder room or guest bath, has only two of the four main bathroom components-typically a toilet and sink.” Consequently, I will also count the half bathrooms as half.
all$TotBathrooms<-all$FullBath+(all$HalfBath*0.5)+all$BsmtFullBath+(all$BsmtHalfBath*0.5)
# As you can see in the first graph, there now seems to be a clear correlation (it’s 0.63). The frequency distribution of Bathrooms in all data is shown in the second graph.
tb1 <- ggplot(all[!is.na(all$SalePrice),],aes(as.factor(TotBathrooms),SalePrice))+
geom_point(col='blue')+ geom_smooth(methods='lm',color='black',aes(group=1))+
#group=1????????
scale_y_continuous(breaks=seq(0,800000,100000),labels=comma)
tb2 <- ggplot(all,aes(as.factor(TotBathrooms)))+
geom_histogram(stat='count')
grid.arrange(tb1,tb2)

Adding ‘House Age’, ‘Remodeled (Yes/No)’, and IsNew variables
# Altogether, there are 3 variables that are relevant with regards to the Age of a house; YearBlt, YearRemodAdd, and YearSold. YearRemodAdd defaults to YearBuilt if there has been no Remodeling/Addition. I will use YearRemodeled and YearSold to determine the Age. However, as parts of old constructions will always remain and only parts of the house might have been renovated, I will also introduce a Remodeled Yes/No variable. This should be seen as some sort of penalty parameter that indicates that if the Age is based on a remodeling date, it is probably worth less than houses that were built from scratch in that same year.
all$Remod<- ifelse(all$YearBuilt==all$YearRemodAdd,0,1) #0=No Remodeling, 1=Remodeling
all$Age<- as.numeric(all$YrSold)-all$YearRemodAdd
ggplot(all[!is.na(all$SalePrice),],aes(Age,SalePrice))+
geom_point(col='blue')+geom_smooth(method='lm',color='black',aes(group=1))+
scale_y_continuous(breaks=seq(0,800000,100000),labels=comma)

# As expected, the graph shows a negative correlation with Age (old house are worth less).
cor(all$SalePrice[!is.na(all$SalePrice)],all$Age[!is.na(all$SalePrice)])
## [1] -0.5090787
# As you can see below, houses that are remodeled are worth less indeed, as expected.
ggplot(all[!is.na(SalePrice),],aes(as.factor(Remod),SalePrice))+
geom_bar(stat='summary',fun.y='median',fill='blue')+
geom_label(stat='count',aes(label=..count..,y=..count..),size=6)+
theme_grey(base_size = 18)+
geom_hline(yintercept=163000,linetype='dashed') #dashed line is median SalePrice

# Finally, I am creating the IsNew variable below. Altogether, there are 116 new houses in the dataset.
all$IsNew<- ifelse(all$YrSold==all$YearBuilt,1,0)
table(all$IsNew)
##
## 0 1
## 2803 116
# These 116 new houses are fairly evenly distributed among train and test set, and as you can see new houses are worth considerably more on average.
ggplot(all[!is.na(all$SalePrice),],aes(as.factor(IsNew),SalePrice))+
geom_bar(stat='summary',fun.y='median',fill='blue')+
geom_label(stat='count',aes(label=..count..,y=..count..),size=6)+
scale_y_continuous(breaks=seq(0,800000,50000),labels=comma)+
theme_grey(base_size=18)+
geom_hline(yintercept=163000,linetype='dashed') #dashed line is median SalePrice

all$YrSold <- as.factor(all$YrSold) #the numeric version is now not needed anymore
Binning Neighborhood
nb1<- ggplot(all[!is.na(all$SalePrice),],aes(reorder(Neighborhood,SalePrice,FUN=median),SalePrice))+
geom_bar(stat='summary',fun.y='median',fill='blue')+labs(x='Neighborhood',y='Medain SalePrice')+
theme(axis.text.x=element_text(angle=45,hjust=1))+
scale_y_continuous(breaks=seq(0,800000,50000),labels=comma)+
geom_hline(yintercept = 163000,linetype='dashed',color='red')+
geom_label(stat='count',aes(label=..count..,y=..count..),size=3)
nb2<- ggplot(all[!is.na(all$SalePrice),],aes(reorder(Neighborhood,SalePrice,FUN = mean),SalePrice))+
geom_bar(stat='summary', fun.y='mean', fill='blue')+labs(x='Neighborhood',y='Mean SalePrice')+
scale_y_continuous(breaks=seq(0,800000,50000),labels=comma)+
geom_label(stat='count',aes(label=..count..,y=..count..),size=3)+
theme(axis.text.x=element_text(angle=45,hjust=1))+
geom_hline(yintercept = 163000,linetype='dashed',color='red') #dashed line is median SalePric
grid.arrange(nb1,nb2)

# Both the median and mean Saleprices agree on 3 neighborhoods with substantially higher saleprices. The separation of the 3 relatively poor neighborhoods is less clear, but at least both graphs agree on the same 3 poor neighborhoods. Since I do not want to ‘overbin’, I am only creating categories for those ‘extremes’.
all$NeighRich[all$Neighborhood%in%c('StoneBr', 'NridgHt', 'NoRidge')]<-2
all$NeighRich[!all$Neighborhood%in%c('MeadowV', 'IDOTRR', 'BrDale', 'StoneBr', 'NridgHt', 'NoRidge')]<-1
all$NeighRich[all$Neighborhood%in%c('MeadowV', 'IDOTRR', 'BrDale')]<-0
sum(table(all$NeighRich))
## [1] 2919
Total Square Feet
# As the total living space generally is very important when people buy houses, I am adding a predictors that adds up the living space above and below ground.
all$TotalSqFeet<-all$GrLivArea+all$TotalBsmtSF
ggplot(all[!is.na(all$SalePrice),],aes(TotalSqFeet,SalePrice))+
geom_point(col='blue',alpha=0.3,position='jitter')+geom_smooth(method='lm',color='black',aes(group=1))+
scale_y_continuous(breaks=seq(0,800000,100000),labels=comma)+
geom_text_repel(aes(label= ifelse(all$GrLivArea[!is.na(all$SalePrice)]>4500,rownames(all),'')))

# As expected, the correlation with SalePrice is very strong indeed (0.78).
cor(all$TotalSqFeet,all$SalePrice,use='pairwise.complete.obs')
## [1] 0.7789588
##上面這行等於這行cor(all$TotalSqFeet[!is.na(SalePrice)],all$SalePrice[!is.na(SalePrice)]),所以use的功用在於排除NA
#The two potential outliers seem to ‘outlie’ even more than before. By taking out these two outliers, the correlation increases by 5%.
cor(all$TotalSqFeet[-c(524,1299)],all$SalePrice[-c(524,1299)],use='pairwise.complete.obs')
## [1] 0.829042
Consolidating Porch variables
# Below, I listed the variables that seem related regarding porches.
# WoodDeckSF: Wood deck area in square feet
# OpenPorchSF: Open porch area in square feet
# EnclosedPorch: Enclosed porch area in square feet
# 3SsnPorch: Three season porch area in square feet
# ScreenPorch: Screen porch area in square feet
# As far as I know, porches are sheltered areas outside of the house, and a wooden deck is unsheltered. Therefore, I am leaving WoodDeckSF alone, and are only consolidating the 4 porch variables.
all$TotalPorchSF<-all$OpenPorchSF+all$EnclosedPorch+all$`3SsnPorch`+all$ScreenPorch
# Although adding up these Porch areas makes sense (there should not be any overlap between areas), the correlation with SalePrice is not very strong.
cor(all$TotalPorchSF,all$SalePrice,use='pairwise.complete.obs')
## [1] 0.1957389
ggplot(all[!is.na(all$SalePrice),],aes(TotalPorchSF,SalePrice))+
geom_point(col='blue',alpha=0.3)+
geom_smooth(method='lm',color='black',aes(group=1))+
scale_y_continuous(breaks=seq(0,800000,100000),labels=comma)

Preparing data for modeling
Removing outliers
all<-all[-c(524,1299),]
PreProcessing predictor variables
#Before modeling I need to center and scale the ‘true numeric’ predictors (so not variables that have been label encoded), and create dummy variables for the categorical predictors. Below, I am splitting the dataframe into one with all (true) numeric variables, and another dataframe holding the (ordinal) factors.
numericVarNames <- numericVarNames[!(numericVarNames%in%c('MSSubClass', 'MoSold', 'YrSold', 'SalePrice', 'OverallQual', 'OverallCond'))] #numericVarNames was created before having done anything
numericVarNames<-append(numericVarNames,c('Age', 'TotalPorchSF', 'TotBathrooms', 'TotalSqFeet'))
DFnumeric<-all[,names(all)%in%numericVarNames]
DFfactors<-all[,!(names(all)%in%numericVarNames)]
DFfactors<-DFfactors[,names(DFfactors)!='SalePrice']
cat('There are', length(DFnumeric), 'numeric variables, and', length(DFfactors), 'factor variables')
## There are 30 numeric variables, and 49 factor variables
Skewness and normalizing of the numeric predictors